# House Prices - Advanced Regression Techniques
### Predict sales prices and practice feature engineering, RFs, and gradient boosting

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview

# Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.metrics import RootMeanSquaredError

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# EDA

In [2]:
# Read the training CSV into a DataFrame; the bare name on the last line
# makes the notebook render a preview of the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='train.csv')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Drop columns with too many missing values
na = df.isna().sum()  # per-column count of missing entries, kept for inspection

# Drop anything over 600 missing values (column list chosen by hand)
df = df.drop(columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

# Rows are kept as they are (no dropna). Missing entries in text columns are
# turned into an explicit 'None' category instead.
# fillna returns a new Series rather than modifying the column in place, so the
# result has to be assigned back -- otherwise the fill is silently discarded.
# Any frame later passed through a pipeline fitted on this data should receive
# the same fill so that missing categories are encoded consistently.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna('None')

print(f"final data shape: {df.shape}")

final data shape: (1460, 76)


In [4]:
# Hold out part of the labelled data for validation.
# random_state is fixed so the same rows land in each part on every run.
target = 'SalePrice'

train, val = train_test_split(df, random_state=1)

# Separate the feature columns from the column being predicted.
X_train, y_train = train.drop(columns=target), train[target]
X_val, y_val = val.drop(columns=target), val[target]

In [5]:
# Preprocessing chain, applied in this order:
#   1. OrdinalEncoder  - encode the categorical columns
#   2. SimpleImputer   - fill the remaining missing values
#   3. StandardScaler  - standardise every feature
pre = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    StandardScaler(),
)

# Fit on the training split only; the validation split is transformed with the
# already-fitted steps so nothing is learned from it.
X_train_scaled = pre.fit_transform(X_train)
X_val_scaled = pre.transform(X_val)

In [6]:
# Width of the network input = number of columns produced by preprocessing.
n_features = X_train_scaled.shape[1]

# Three 128-unit ReLU hidden layers with He-normal initialisation, followed by
# a single output unit with no activation for the regression target.
hidden_kwargs = dict(activation='relu', kernel_initializer='he_normal')
model = Sequential([
    Dense(128, input_shape=(n_features,), **hidden_kwargs),
    Dense(128, **hidden_kwargs),
    Dense(128, **hidden_kwargs),
    Dense(1),
])

# Minimise mean squared error with Adam; RMSE is tracked as an extra metric.
model.compile(optimizer='adam', loss='mse', metrics=[RootMeanSquaredError()])
model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=32,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1811be0e070>

# Predict test data and save

In [17]:
# Load the unlabelled test set and prepare it the same way as the training data.
test = pd.read_csv('test.csv')
# Drop anything over 600 missing values (same column list as for the training data)
test = test.drop(columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

# Rows are kept as they are (one prediction per test row is needed). Missing
# entries in text columns are turned into an explicit 'None' category.
# fillna returns a new Series rather than modifying the column in place, so the
# result has to be assigned back -- otherwise the fill is silently discarded.
# The frame the pipeline `pre` was fitted on should have received the same fill
# so that missing categories are encoded consistently.
for col in test.select_dtypes(include='object').columns:
    test[col] = test[col].fillna('None')

print(f"final data shape: {test.shape}")

# Reuse the already-fitted preprocessing pipeline, then flatten the model's
# predictions to a 1-D array.
y_pred = model.predict(pre.transform(test)).ravel()
print(y_pred)

# Build the submission table: Ids come from the sample submission file,
# prices from the model.
submit = pd.DataFrame({'Id': pd.read_csv('sample_submission.csv')['Id'], 'SalePrice': y_pred})

submit.to_csv('submission_ver2.csv', index=False)

final data shape: (1459, 75)
[101298.88 148888.56 186576.89 ... 193377.36 106006.48 284044.6 ]
