In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_a.csv'))
# Print column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41440 entries, 0 to 41439
Data columns (total 51 columns):
ID                    41440 non-null int64
area                  41440 non-null float64
rentType              41440 non-null object
houseType             41440 non-null object
houseFloor            41440 non-null object
totalFloor            41440 non-null int64
houseToward           41440 non-null object
houseDecoration       41440 non-null object
communityName         41440 non-null object
city                  41440 non-null object
region                41440 non-null object
plate                 41440 non-null object
buildYear             41440 non-null object
saleSecHouseNum       41440 non-null int64
subwayStationNum      41440 non-null int64
busStationNum         41440 non-null int64
interSchoolNum        41440 non-null int64
schoolNum             41440 non-null int64
privateSchoolNum      41440 non-null int64
hospitalNum           41440 non-null int64
drugStoreNum       

In [3]:
# Number of distinct non-null values in each column of the training frame.
train.nunique(axis=0, dropna=True)

ID                    41440
area                  10353
rentType                  4
houseType               104
houseFloor                3
totalFloor               55
houseToward              10
houseDecoration           4
communityName          4236
city                      1
region                   15
plate                    66
buildYear                80
saleSecHouseNum          28
subwayStationNum         13
busStationNum            59
interSchoolNum            7
schoolNum                44
privateSchoolNum         17
hospitalNum              11
drugStoreNum             42
gymNum                   39
bankNum                  45
shopNum                  56
parkNum                  18
mallNum                  17
superMarketNum           49
totalTradeMoney         704
totalTradeArea          705
tradeMeanPrice          705
tradeSecNum             333
totalNewTradeMoney      558
totalNewTradeArea       533
tradeNewMeanPrice       557
tradeNewNum             157
remainNewNum        

处理思路：面积和租金的异常值应该清理，rentType和houseType应该是强特征，houseFloor和totalFloor可以一起处理成一个量。
houseToward和houseDecoration可以one-hot编码。tradeTime和buildYear组成房屋的年龄age。ID和city应该扔掉。
region到uv之间的量（除去buildYear）主要包含位置特征和月份特征，可以考虑PCA降维。

In [4]:
# Remove listings whose area or rent is clearly unreasonable:
# area above 200 or below 10, rent above 60000 or exactly 0.
implausible = (
    (train.area > 200)
    | (train.area < 10)
    | (train.tradeMoney > 60000)
    | (train.tradeMoney == 0)
)
# Drop by index label, in place, so `train` stays the same object.
train.drop(index=train.index[implausible], inplace=True)

In [5]:
# Normalise rentType and split houseType into three categorical columns.
# Both frames get exactly the same treatment so that the later one-hot
# encoding yields matching column names for train and test.
for frame in (train, test):
    # '--' is relabelled as '未知方式'. Previously this was done for train
    # only, which would leave test with a different category label.
    frame.loc[frame.rentType == '--', 'rentType'] = '未知方式'

    # houseType is cut at fixed offsets: characters 0-1, 2-3 and 4 onward.
    # NOTE(review): this assumes single-digit counts (labels such as '7室',
    # '4厅', '7卫'); a two-digit count would be split incorrectly — confirm
    # against the raw data.
    house_type = frame.pop('houseType')
    frame['roomNum'] = house_type.str[:2]
    frame['hallNum'] = house_type.str[2:4]
    frame['bathroomNum'] = house_type.str[4:]

In [6]:
# Numeric fraction substituted for each houseFloor label.
floor_fraction = {'高': 5/6, '中': 0.5, '低': 1/6}

for frame in (train, test):
    # Replace each known label by its fraction; any other value is kept as is.
    frame['houseFloor'] = [floor_fraction.get(band, band) for band in frame['houseFloor']]
    # Combine the fraction with the building's total floor count into one feature.
    frame['floor'] = frame['houseFloor'] * frame['totalFloor']
    # The two source columns are no longer needed.
    frame.drop(columns=['houseFloor', 'totalFloor'], inplace=True)

In [7]:
# Derive an `age` feature from buildYear, then drop columns that are not used
# as features. The original note describes the fill as a mean fill; the code
# substitutes the constant 1999 — presumably the rounded mean, TODO confirm.
for frame in (train, test):
    unknown_year = frame['buildYear'] == '暂无信息'
    frame.loc[unknown_year, 'buildYear'] = 1999
    # buildYear holds a mix of strings and ints at this point; coerce to int.
    frame['buildYear'] = frame['buildYear'].apply(int)
    # Age is measured against the fixed year 2018.
    frame['age'] = 2018 - frame['buildYear']
    frame.drop(columns=['buildYear', 'tradeTime', 'ID', 'city'], inplace=True)

In [8]:
# Fill missing pv / uv values with the median of the training column.
# The training median is used for both frames, so test is filled with a
# statistic learned from train only.
#
# The result is assigned back to the column explicitly: calling
# `frame[col].fillna(..., inplace=True)` operates on an intermediate Series
# and does not update the frame when pandas Copy-on-Write is active.
for column in ('pv', 'uv'):
    train_median = train[column].median()  # median() skips NaN by default
    test[column] = test[column].fillna(train_median)
    train[column] = train[column].fillna(train_median)



In [9]:
# Community, region and plate are left out for now.
unused_location_columns = ['communityName', 'region', 'plate']
for frame in (train, test):
    frame.drop(columns=unused_location_columns, inplace=True)

In [10]:
# Categorical columns that will be one-hot encoded.
onehot_list = [
    'rentType',
    'houseToward',
    'houseDecoration',
    'roomNum',
    'hallNum',
    'bathroomNum',
]

# Numeric columns that will be scaled and compressed with PCA.
# The order is significant: it fixes the column order fed to the scaler/PCA.
PCA_list = [
    'saleSecHouseNum',
    'subwayStationNum', 'busStationNum',
    'interSchoolNum', 'schoolNum', 'privateSchoolNum',
    'hospitalNum', 'drugStoreNum', 'gymNum', 'bankNum',
    'shopNum', 'parkNum', 'mallNum', 'superMarketNum',
    'totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice', 'tradeSecNum',
    'totalNewTradeMoney', 'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum',
    'remainNewNum', 'supplyNewNum',
    'supplyLandNum', 'supplyLandArea',
    'tradeLandNum', 'tradeLandArea',
    'landTotalPrice', 'landMeanPrice',
    'totalWorkers', 'newWorkers', 'residentPopulation',
    'pv', 'uv',
]

In [11]:
# One-hot encode the categorical columns of both frames.
train = pd.get_dummies(columns=onehot_list, data=train)
test = pd.get_dummies(columns=onehot_list, data=test)

# get_dummies only creates a column for a category that occurs in the frame it
# is given, so train and test can end up with different dummy columns. Align
# test to train's feature columns (everything except the target): categories
# missing from test become all-zero columns, categories unseen in train are
# dropped, and the column order matches train.
feature_columns = train.columns.drop('tradeMoney', errors='ignore')
test = test.reindex(columns=feature_columns, fill_value=0)

del onehot_list

In [12]:
# Separate the target column from the features: pop() returns the column
# and removes it from `train` in one step.
y = train.pop('tradeMoney')

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import PCA

In [14]:
# PCA dimensionality reduction: the columns in PCA_list are min-max scaled,
# projected onto N_PCA_COMPONENTS principal components, and replaced in both
# frames by columns named PCA0 .. PCA<N-1>.
N_PCA_COMPONENTS = 10  # single source of truth for the component count

minmaxscaler = MinMaxScaler()
# Fixed seed so the projection is reproducible: depending on the data shape and
# the scikit-learn version, PCA may select a randomized SVD solver.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=0)

col = ['PCA' + str(i) for i in range(N_PCA_COMPONENTS)]


def _swap_in_components(frame, components, source_columns, component_names):
    """Return `frame` with `source_columns` removed and the projected
    `components` appended as columns named `component_names`."""
    projected = pd.DataFrame(components, index=frame.index, columns=component_names)
    return pd.concat([frame.drop(columns=source_columns), projected], axis=1)


# The scaler and PCA are fitted on train only ...
X = pca.fit_transform(minmaxscaler.fit_transform(train[PCA_list]))
train = _swap_in_components(train, X, PCA_list, col)

# ... and test is transformed with those fitted parameters.
X = pca.transform(minmaxscaler.transform(test[PCA_list]))
test = _swap_in_components(test, X, PCA_list, col)

del PCA_list, col, X

  return self.partial_fit(X, y)


In [15]:
# Random forest evaluated with 5-fold out-of-fold predictions.
# random_state is fixed so the reported score is reproducible across runs.
RF = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0)
y_pred = cross_val_predict(RF, train, y, cv=5)

# Coefficient of determination: 1 - SS_residual / SS_total.
residual_ss = np.sum(np.power(y_pred - y, 2))
total_ss = np.sum(np.power(y - np.mean(y), 2))
score = 1 - residual_ss / total_ss
print(score)

0.8272351902243611


In [16]:
# Fit on the full training set and list the feature names ordered from least
# to most important (argsort sorts ascending).
RF.fit(train, y)
importance_order = np.argsort(RF.feature_importances_)
train.columns[importance_order]

Index(['bathroomNum_7卫', 'hallNum_4厅', 'roomNum_7室', 'houseToward_东西',
       'roomNum_6室', 'bathroomNum_5卫', 'roomNum_5室', 'bathroomNum_4卫',
       'rentType_合租', 'hallNum_3厅', 'houseToward_北', 'houseToward_西北',
       'houseDecoration_简装', 'houseToward_东南', 'houseToward_东',
       'houseDecoration_毛坯', 'bathroomNum_3卫', 'houseToward_西', 'hallNum_0厅',
       'houseToward_暂无数据', 'houseToward_西南', 'roomNum_4室', 'houseToward_南北',
       'hallNum_1厅', 'bathroomNum_0卫', 'lookNum', 'rentType_未知方式',
       'houseDecoration_其他', 'bathroomNum_2卫', 'rentType_整租', 'houseToward_南',
       'bathroomNum_1卫', 'roomNum_3室', 'roomNum_2室', 'houseDecoration_精装',
       'roomNum_1室', 'PCA9', 'hallNum_2厅', 'PCA3', 'PCA8', 'floor', 'PCA6',
       'PCA4', 'PCA5', 'age', 'PCA2', 'PCA0', 'PCA7', 'PCA1', 'area'],
      dtype='object')

In [17]:
# Positional indices of the features, ordered by ascending importance.
RF.feature_importances_.argsort()

array([39, 32, 27,  9, 26, 38, 25, 37,  4, 31, 10, 15, 19,  8,  7, 18, 36,
       14, 28, 13, 16, 24, 12, 29, 33,  1,  6, 17, 35,  5, 11, 34, 23, 22,
       20, 21, 49, 30, 43, 48,  2, 46, 44, 45,  3, 42, 40, 47, 41,  0],
      dtype=int64)