# Housing Prices Competition for Kaggle Learn Users
### Apply what you learned in the Machine Learning course on Kaggle Learn alongside others in the course.

https://www.kaggle.com/c/home-data-for-ml-course/overview

# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer

from xgboost.sklearn import XGBRegressor

from eli5.sklearn import PermutationImportance
from eli5 import show_weights

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# EDA

In [2]:
# Load and preview
df = pd.read_csv('train.csv')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Drop columns with too many missing value
na = df.isna().sum()
display(na[na > 0]) # show all columns with NaN

# Drop anything over 600 missing value
df = df.drop(columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'])

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [4]:
# Split Data
train, val = train_test_split(df, random_state=1)

target = 'SalePrice'

X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]

# Feature Importance

In [5]:
preprocessing = make_pipeline(OrdinalEncoder(), SimpleImputer())

X_train_processed = preprocessing.fit_transform(X_train)
X_val_processed = preprocessing.transform(X_val)

model = XGBRegressor()
model.fit(X_train_processed, y_train)

permuter = PermutationImportance(model, scoring='neg_root_mean_squared_error')
permuter.fit(X_val_processed, y_val)

feature_names = X_val.columns.tolist()

show_weights(
    permuter,
    top=None,
    feature_names=feature_names
)

Weight,Feature
16788.6737  ± 2147.8904,OverallQual
10018.5759  ± 592.5786,GrLivArea
7425.2652  ± 3551.8681,2ndFlrSF
2329.2222  ± 1312.1740,LotArea
1821.6137  ± 234.3672,YearBuilt
1669.3508  ± 468.3899,GarageCars
1495.1301  ± 207.2982,KitchenQual
1288.0591  ± 954.5046,TotalBsmtSF
1111.6270  ± 410.2575,BsmtFinSF1
1015.6246  ± 671.6180,BsmtExposure


# Making models with selected features

In [6]:
# Select features
fi = pd.Series(permuter.feature_importances_, feature_names).sort_values()
selected_features = fi[fi > 100].index.tolist()

X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]
X_test = pd.read_csv('test.csv')[selected_features] # load test set as well

X_test.head()

Unnamed: 0,Exterior2nd,BsmtUnfSF,BsmtFullBath,GarageFinish,CentralAir,YrSold,SaleCondition,ExterQual,PoolArea,WoodDeckSF,...,BsmtExposure,BsmtFinSF1,TotalBsmtSF,KitchenQual,GarageCars,YearBuilt,LotArea,2ndFlrSF,GrLivArea,OverallQual
0,VinylSd,270.0,0.0,Unf,Y,2010,Normal,TA,0,140,...,No,468.0,882.0,TA,1.0,1961,11622,0,896,5
1,Wd Sdng,406.0,0.0,Unf,Y,2010,Normal,TA,0,393,...,No,923.0,1329.0,Gd,1.0,1958,14267,0,1329,6
2,VinylSd,137.0,0.0,Fin,Y,2010,Normal,TA,0,212,...,No,791.0,928.0,TA,2.0,1997,13830,701,1629,5
3,VinylSd,324.0,0.0,Fin,Y,2010,Normal,TA,0,360,...,No,602.0,926.0,Gd,2.0,1998,9978,678,1604,6
4,HdBoard,1017.0,0.0,RFn,Y,2010,Normal,Gd,0,0,...,No,263.0,1280.0,Gd,2.0,1992,5005,0,1280,8


In [7]:
pipe = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    XGBRegressor(n_jobs=-1)
)

pipe.fit(X_train_selected, y_train)
y_pred = pipe.predict(X_val_selected)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"rmse: {rmse}") # without feature selection: 30721.90

rmse: 23949.730690130673


In [8]:
params = {
    'xgbregressor__n_estimators': range(50, 150, 20),
    'xgbregressor__max_depth': range(2, 6),
    'xgbregressor__learning_rate': [0.1, 0.2, 0.3, 0.5]
}

clf = GridSearchCV(
    pipe,
    param_grid=params,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

clf.fit(X_train_selected, y_train)

print('최적 하이퍼파라미터: ', clf.best_params_)
print('Score: ', -clf.best_score_)

Fitting 3 folds for each of 80 candidates, totalling 240 fits
최적 하이퍼파라미터:  {'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__n_estimators': 110}
Score:  29232.62966386289


In [9]:
final_model =  clf.best_estimator_

y_pred = final_model.predict(X_val_selected)

rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"rmse: {rmse}")

rmse: 24922.127345026624


# Predict test data and save

In [10]:
y_pred = final_model.predict(X_test)

submit = pd.DataFrame({'Id': pd.read_csv('sample_submission.csv')['Id'], 'SalePrice': y_pred})

submit.to_csv('submission_ver1.csv', index=False)