In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Keep the target as a plain array before it is removed from the frame.
y = train['SalePrice'].values

# Remove the target and the 'Id' column from train, and 'Id' from test.
train = train.drop(columns=['SalePrice', 'Id'])
test = test.drop(columns=['Id'])

# Row counts of each part; ntrain is used later to split the combined frame.
ntrain, ntest = len(train), len(test)

# Stack train on top of test with a fresh 0..n-1 index so the
# preprocessing below is applied to both parts at once.
all_data = pd.concat([train, test], ignore_index=True)

In [4]:
# Log-transform the target: y becomes log(1 + SalePrice).
# NOTE: this overwrites y, so running the cell a second time would apply
# the transform twice; restart and run all cells if it has to be re-run.
y = np.log1p(y)

# Fraction of missing values per column, restricted to columns that have any.
miss = all_data.isnull().sum()/len(all_data)
miss = miss[miss>0]
# Kept as the last expression of the cell so the sorted table is displayed
# (a bare expression in the middle of a cell is evaluated and discarded).
miss.sort_values()

In [5]:
# Split the training features by dtype: non-numeric columns on one side,
# numeric columns on the other.
categoric = train.select_dtypes(exclude=[np.number])
numeric = train.select_dtypes(include=[np.number])

In [6]:
# Dealing with missing values: in these columns every missing entry is
# replaced by the literal string 'None'.
high_missing = ['FireplaceQu','Fence','Alley','MiscFeature','PoolQC']
all_data[high_missing] = all_data[high_missing].fillna('None')

In [7]:
# Fill each missing LotFrontage with the median LotFrontage of the row's
# Neighborhood: compute the per-row group median first, then use it as the
# fill value.
neighborhood_median = all_data.groupby("Neighborhood")["LotFrontage"].transform("median")
all_data["LotFrontage"] = all_data["LotFrontage"].fillna(neighborhood_median)

In [8]:
# Garage descriptor columns: missing entries become the string 'None'.
garage_cat_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
all_data[garage_cat_cols] = all_data[garage_cat_cols].fillna('None')

In [9]:
# Garage year/area/car-count columns: missing entries become 0.
garage_num_fill = dict.fromkeys(('GarageYrBlt', 'GarageArea', 'GarageCars'), 0)
all_data = all_data.fillna(garage_num_fill)

In [10]:
# Basement size and bathroom-count columns: missing entries become 0.
bsmt_numeric = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
all_data[bsmt_numeric] = all_data[bsmt_numeric].fillna(0)

# Basement descriptor columns: missing entries become the string 'None'.
bsmt_categorical = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
all_data[bsmt_categorical] = all_data[bsmt_categorical].fillna('None')

# Masonry veneer: missing type -> "None", missing area -> 0.
all_data = all_data.fillna({"MasVnrType": "None", "MasVnrArea": 0})

# MSZoning: missing entries take the most frequent value of the column.
most_common_zoning = all_data['MSZoning'].mode()[0]
all_data['MSZoning'] = all_data['MSZoning'].fillna(most_common_zoning)

In [11]:
# Remove the 'Utilities' column from the combined frame.
all_data = all_data.drop(columns=['Utilities'])

# Missing 'Functional' entries are set to "Typ".
all_data = all_data.fillna({"Functional": "Typ"})


In [14]:
# Count the missing values still left in each column and display only the
# columns that have at least one, in ascending order.
null_counts = all_data.isnull().sum()
miss1 = null_counts[null_counts > 0]
miss1.sort_values()

Exterior1st    1
Exterior2nd    1
Electrical     1
KitchenQual    1
SaleType       1
dtype: int64

In [15]:
# Each of these columns gets its missing entries replaced by the column's
# most frequent value.
for mode_col in ('Exterior1st', 'Exterior2nd', 'Electrical', 'KitchenQual', 'SaleType'):
    most_frequent = all_data[mode_col].mode()[0]
    all_data[mode_col] = all_data[mode_col].fillna(most_frequent)

In [17]:
# Summary statistics (count / unique / top / freq) for the non-numeric columns.
all_data.describe(exclude=[np.number])

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,2919,2919,2919.0,2919,2919,2919,2919,2919,2919,2919,...,2919,2919,2919,2919,2919,2919.0,2919.0,2919.0,2919,2919
unique,5,2,3.0,4,4,5,3,25,9,8,...,7,4,6,6,3,4.0,5.0,5.0,9,6
top,RL,Pave,,Reg,Lvl,Inside,Gtl,NAmes,Norm,Norm,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
freq,2269,2907,2721.0,1859,2622,2133,2778,443,2511,2889,...,1723,1230,2604,2654,2641,2909.0,2348.0,2814.0,2526,2402


In [18]:
# One-hot encode the combined frame. With no `columns` argument,
# pd.get_dummies converts the object/string/category columns and leaves
# the remaining columns as they are.
all_data = pd.get_dummies(data=all_data)

In [20]:
# Display (rows, columns) of the combined frame after one-hot encoding.
all_data.shape

(2919, 300)

In [21]:
# New feature: row-wise sum of the TotalBsmtSF, 1stFlrSF and 2ndFlrSF columns.
all_data['TotalSF'] = (
    all_data['TotalBsmtSF']
    .add(all_data['1stFlrSF'])
    .add(all_data['2ndFlrSF'])
)

In [22]:
# Keep a view of only the numeric columns of the combined frame.
numeric_all = all_data.select_dtypes(include=[np.number])

In [24]:
# Undo the earlier concatenation: the first ntrain rows are the training
# rows, everything after them belongs to the test set.
train_all, test_all = all_data.iloc[:ntrain], all_data.iloc[ntrain:]

In [37]:
#modelling(using XGBoost)
import xgboost as xgb

# Hyper-parameters of the regressor, gathered in one mapping so they can be
# read and tuned in a single place.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    random_state=7,
    nthread=-1,
)
model_xgb = xgb.XGBRegressor(**xgb_params)

# Fit on the training rows; y holds the log1p-transformed target.
model_xgb.fit(train_all, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
       max_delta_step=0, max_depth=3, min_child_weight=1.7817,
       missing=None, n_estimators=2200, n_jobs=1, nthread=-1,
       objective='reg:linear', random_state=7, reg_alpha=0.464,
       reg_lambda=0.8571, scale_pos_weight=1, seed=None, silent=1,
       subsample=0.5213)

In [40]:
#predict
# The model was fitted on a log1p-transformed target, so its raw output is
# mapped back with expm1.
log_scale_pred = model_xgb.predict(test_all)
xgb_pred = np.expm1(log_scale_pred)

In [44]:
#score
from sklearn.metrics import mean_squared_error

In [47]:
#submission
# Re-read the test file to recover its 'Id' column, which was dropped from
# the in-memory `test` frame during preprocessing.
test_submit = pd.read_csv('test.csv')
test_ID = test_submit.loc[:, 'Id']

In [48]:
# Display (rows, columns) of the freshly re-read test file.
test_submit.shape

(1459, 80)

In [51]:
# Display the number of predictions produced for the test rows.
xgb_pred.size

1459

In [52]:
#final_submission
# Two-column table (Id, SalePrice) written to disk without the index column.
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': xgb_pred})
sub.to_csv('submission.csv', index=False)