# Housing Prices Competition

In [1]:
# Set up code checking
import os
if not os.path.exists("../input/train.csv"):
    os.symlink("../input/home-data-for-ml-course/train.csv", "../input/train.csv")  
    os.symlink("../input/home-data-for-ml-course/test.csv", "../input/test.csv") 
from learntools.core import binder
binder.bind(globals())
from learntools.ml_intermediate.ex2 import *
print("Setup Complete")

Setup Complete


## 1. 전처리

import pandas as pd
from sklearn.model_selection import train_test_split

X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [3]:
print(X_train.shape)

(1168, 36)


In [5]:
from sklearn.impute import SimpleImputer
my_imputer=SimpleImputer()
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
final_X_train.columns = X_train.columns
final_X_valid.columns = X_valid.columns

## 2. 모델 학습

In [23]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

param = {
    'n_estimators': [800, 1000, 1200],
    'max_depth': [5, 6],
    'learning_rate': [0.005, 0.01, 0.03],
    'reg_lambda': [10,20,50,100],
    'reg_alpha': [0.1, 1, 5, 10],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

xgb = XGBRegressor(tree_method='hist', device = 'cuda' , random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param,
    n_iter=50,  
    cv=5 ,     
    scoring='neg_mean_absolute_error',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(final_X_train, y_train)

model = random_search.best_estimator_

preds_valid = model.predict(final_X_valid)
print(mean_absolute_error(y_valid, preds_valid))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
17074.574780607876


## 3. 모델 평가

In [25]:
print(model.score(final_X_train, y_train))
print(random_search.best_params_)
print(random_search.best_score_)

0.9730627605361916
{'subsample': 0.7, 'reg_lambda': 10, 'reg_alpha': 5, 'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
-16258.304953505009


In [26]:
imputer=SimpleImputer()
final_X_test = pd.DataFrame(imputer.fit_transform(X_test), columns=X_test.columns, index=X_test.index)
preds_test = model.predict(final_X_test)

In [27]:
# Save test predictions to file
output = pd.DataFrame({'Id': final_X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission12.csv', index=False)

## 4. 최종 Test 예측 결과

15127.51213점  (359/6073 등)

위 분석을 하면서 SimpleImputer를 통한 결측값 처리, 랜덤서치 등을 활용할 수 있게 되었다. 
랜덤서치는 xgboost로 진행하였고, 과대적합과 과소적합을 고려해서 탐색 파라미터 범위를 바꿔보며 모델의 일반화 성능을 올렸다.   
이 경험이 앞으로 데이터 전처리와 하이퍼파라미터 튜닝에 도움이 될 것으로 예상한다.
앞으로 eda를 통한 피쳐 중요도파악, 검증집합 그래프, 앙상블 기법 등을 시도하여 모델의 성능을 더 향상할 수 있을 것으로 예상한다.