In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

In [2]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10)

warnings.filterwarnings('ignore')

In [3]:
def load_data():
    df_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv", index_col="Id")
    df_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv", index_col="Id")
    df = pd.concat([df_train, df_test])
    # Reform splits
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]
    return df_train, df_test

In [4]:
df_train, df_test = load_data()

In [5]:
def preprocess(df):
    categorical_cols = ["LotConfig","LotArea","LandSlope","Neighborhood","Condition1",
                   "Condition2","BldgType","HouseStyle","RoofStyle","RoofMatl",
                   "Exterior1st","Exterior2nd","MasVnrType","ExterQual","ExterCond",
                    "Foundation","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1",
                   "BsmtFinSF1","BsmtFinType2","BsmtFinSF2","BsmtUnfSF",
                    "TotalBsmtSF","Heating","HeatingQC","CentralAir","Utilities",
                    "Electrical","KitchenQual","Functional","PavedDrive","SaleType",
                    "GarageType","GarageFinish","GarageQual","GarageCond",
                    "SaleCondition","MSZoning","LotShape","Street","LandContour"]
    df['MasVnrType'].fillna('None', inplace=True)
    df['BsmtQual'].fillna('None', inplace=True)
    df['BsmtCond'].fillna('None', inplace=True)
    df['BsmtExposure'].fillna('None', inplace=True)
    df['BsmtFinType1'].fillna('None', inplace=True)
    df['BsmtFinType2'].fillna('None', inplace=True)
    df['Electrical'].fillna('None', inplace=True)
    df['GarageType'].fillna('None', inplace=True)
    df['GarageFinish'].fillna('None', inplace=True)
    df['GarageQual'].fillna('None', inplace=True)
    df['GarageCond'].fillna('None', inplace=True)
    df['Exterior1st'].fillna('None', inplace=True)
    df['Exterior2nd'].fillna('None', inplace=True)
    df['Utilities'].fillna('None', inplace=True)
    df['Electrical'].fillna('None', inplace=True)
    df['KitchenQual'].fillna('None', inplace=True)
    df['Functional'].fillna('None', inplace=True)
    df['SaleType'].fillna('None', inplace=True)
    df['MSZoning'].fillna('None', inplace=True)
    df['LotArea'] = np.log1p(df['LotArea'])
    df['LotFrontage'].fillna(np.mean(df['LotFrontage']), inplace=True)
    df['MasVnrArea'].fillna(0, inplace=True)
    df['GarageYrBlt'].fillna(df['YearBuilt'], inplace=True)
    
    for col in categorical_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])

    df['MasVnrArea'] = df['MasVnrArea'].astype(int)
    return df

In [6]:
df_test = preprocess(df_test)
df_train = preprocess(df_train)

In [7]:
df_test = df_test.drop(columns='SalePrice')

In [8]:
null_dominant_cols = ["Alley","PoolQC","Fence","MiscFeature","FireplaceQu"]
df_train.drop(columns=null_dominant_cols, inplace=True)
df_test.drop(columns=null_dominant_cols, inplace=True)

In [9]:
X = df_train.copy()
y = X.pop("SalePrice")
log_y = np.log(y)

model = XGBRegressor(max_depth=40,
                     min_child_samples=100,
                     metric='rmse',
                     verbosity=3,
                     random_state=0)

model.fit(X, log_y)

Parameters: { "metric", "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[08:56:21] DEBUG: ../src/gbm/gbtree.cc:155: Using tree method: 2
[08:56:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 0 pruned nodes, max_depth=0
[08:56:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 0 pruned nodes, max_depth=1
[08:56:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 2 extra nodes, 0 pruned nodes, max_depth=1
[08:56:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 4 extra nodes, 0 pruned nodes, max_depth=2
[08:56:21] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 8 extra nodes, 0 pruned nodes, max_depth=3
[08:56:21] INFO: ../src/tree/updater_prun

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=40, metric='rmse',
             min_child_samples=100, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=3)

In [10]:
predictions = model.predict(df_test)

In [11]:
output = pd.DataFrame({'Id': df_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
