In [1]:
import pandas  as pd
import numpy   as np
import xgboost as xgb


# load the train / test CSVs; the first column of each file becomes the index

train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train.csv', 'test.csv')
)

Our first task is feature exploration and selection.

In [2]:
# preview the first rows of the training frame (rendered as the cell output)
train_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
target = 'SalePrice'

#===========================================================================
# build the design matrices from the numeric columns only; the target is
# removed from the training features and kept separately as y_train
#===========================================================================
X_train = (
    train_data
    .select_dtypes(include=['number'])
    .copy()
    .drop(columns=[target])
)
y_train = train_data[target]
X_test = (
    test_data
    .select_dtypes(include=['number'])
    .copy()
)

#===========================================================================
# simple preprocessing: imputation; substitute any 'NaN' with mean value
#
# The column means are computed on the training data only and then applied
# to BOTH frames. Filling X_test with its own means would preprocess the
# test rows with different statistics than the model was trained on, and
# would leave NaNs behind in any test column that is entirely missing.
#===========================================================================
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
# fillna with a Series aligns on column labels, so each X_test column gets
# the training mean of the column with the same name
X_test  = X_test.fillna(train_means)

#===========================================================================
# set up our regressor. Today we shall be using the random forest.
#
# random_state is pinned: a random forest is stochastic, and without a fixed
# seed the feature ranking, the selected features and the submission all
# change from one run of the notebook to the next.
#===========================================================================
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

#===========================================================================
# rank the features with scikit-learn's Recursive Feature Elimination (RFE)
#===========================================================================
from sklearn.feature_selection import RFE
# eliminating all the way down to one surviving feature is done purely to
# obtain a ranking of the columns (read from rfe.ranking_ afterwards)
n_features_to_select = 1
rfe = RFE(
    estimator=regressor,
    n_features_to_select=n_features_to_select,
)
rfe.fit(X_train, y_train)

In [7]:
#===========================================================================
# list every feature next to its RFE rank, best (rank 1) first
#===========================================================================
from operator import itemgetter
features = X_train.columns.to_list()
# pair each rank with its column name, then order the pairs by rank
ranked_features = sorted(zip(rfe.ranking_, features), key=itemgetter(0))
for rank, name in ranked_features:
    print(rank, name)

1 OverallQual
2 GrLivArea
3 TotalBsmtSF
4 BsmtFinSF1
5 2ndFlrSF
6 GarageArea
7 YearBuilt
8 1stFlrSF
9 GarageCars
10 LotArea
11 YearRemodAdd
12 LotFrontage
13 BsmtUnfSF
14 TotRmsAbvGrd
15 OpenPorchSF
16 GarageYrBlt
17 MasVnrArea
18 WoodDeckSF
19 FullBath
20 OverallCond
21 Fireplaces
22 MoSold
23 MSSubClass
24 YrSold
25 BedroomAbvGr
26 ScreenPorch
27 KitchenAbvGr
28 BsmtFullBath
29 HalfBath
30 EnclosedPorch
31 BsmtFinSF2
32 3SsnPorch
33 BsmtHalfBath
34 PoolArea
35 LowQualFinSF
36 MiscVal


In [8]:
#===========================================================================
# second pass: keep the 10 best features and use them for the model.
# note that this rebinds `rfe`, replacing the ranking-only fit
#===========================================================================
n_features_to_select = 10
rfe = RFE(
    estimator=regressor,
    n_features_to_select=n_features_to_select,
)
rfe.fit(X_train, y_train)

In [9]:
#===========================================================================
# use the model to predict the prices for the test data
#
# `rfe` is whichever RFE object was fitted last (run order matters here).
# The full X_test frame is passed in; per the scikit-learn docs RFE.predict
# reduces it to the selected features itself -- TODO confirm for the
# installed version.
#===========================================================================
predictions = rfe.predict(X_test)

In [10]:
#===========================================================================
# assemble the submission table (test-set index as "Id" plus the predicted
# target column) and save it as CSV without the DataFrame's own index
#===========================================================================
output = pd.DataFrame(
    data={
        "Id": test_data.index,
        target: predictions,
    }
)
output.to_csv(path_or_buf='submission.csv', index=False)

In [12]:
# boolean mask with one entry per X_train column; True marks a feature that
# the fitted RFE kept
rfe.support_

array([False, False,  True,  True, False,  True, False, False,  True,
       False, False,  True,  True,  True, False,  True, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False])