In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape, test.shape)


(1460, 81) (1459, 80)


In [2]:
train_ID = train['Id']
test_ID = test['Id']
y = train['SalePrice']

train.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# Combine for preprocessing
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)



In [3]:
# Fill 'None' for categorical NAs
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
             'BsmtFinType2', 'MasVnrType']
for col in none_cols:
    all_data[col] = all_data[col].fillna('None')

# Fill 0 for numerical where NA means absence
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 
             'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 
             'BsmtHalfBath', 'MasVnrArea']
for col in zero_cols:
    all_data[col] = all_data[col].fillna(0)

# LotFrontage by Neighborhood median
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

# Drop this feature due to redundancy
all_data.drop(['Utilities'], axis=1, inplace=True)


In [4]:
# Convert numerical to categorical
cat_convert = ['MSSubClass', 'MoSold', 'YrSold', 'OverallCond']
for col in cat_convert:
    all_data[col] = all_data[col].astype(str)


In [12]:
from sklearn.preprocessing import LabelEncoder

ordinal_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
                    'HeatingQC', 'KitchenQual', 'FireplaceQu', 
                    'GarageQual', 'GarageCond', 'PoolQC']

for col in ordinal_features:
    lbl = LabelEncoder()
    all_data[col] = lbl.fit_transform(all_data[col].astype(str))


In [13]:
# Total square footage
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Total bathrooms
all_data['TotalBathrooms'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                               all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])

# Porch area
all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['EnclosedPorch'] +
                            all_data['3SsnPorch'] + all_data['ScreenPorch'])

# Flags
all_data['HasPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['Has2ndFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)


In [14]:
all_data = pd.get_dummies(all_data)


In [15]:
X_train = all_data[:len(y)]
X_test = all_data[len(y):]


In [17]:
!pip install xgboost



Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 393.8 kB/s eta 0:06:21
   ---------------------------------------- 0.4/150.0 MB 3.1 MB/s eta 0:00:49
   ---------------------------------------- 1.0/150.0 MB 6.3 MB/s eta 0:00:24
   ---------------------------------------- 1.7/150.0 MB 8.5 MB/s eta 0:00:18
    --------------------------------------- 2.4/150.0 MB 9.6 MB/s eta 0:00:16
    --------------------------------------- 3.2/150.0 MB 10.6 MB/s eta 0:00:14
   - -------------------------------------- 3.8/150.0 MB 11.1 MB/s eta 0:00:14
   - -------------------------------------- 4.4/150.0 MB 11.6 MB/s eta 0:00:13
   - -------------------------------------- 4.6/150.0 MB 10.9 MB/s eta 0:00:14
  

In [18]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
score = cross_val_score(model, X_train, y, scoring="neg_root_mean_squared_error", cv=5)

print("XGBoost CV RMSE:", -np.mean(score))

# Train on full training set
model.fit(X_train, y)


XGBoost CV RMSE: 28101.2


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [19]:
preds = model.predict(X_test)

submission = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': preds
})

submission.to_csv('submission.csv', index=False)
print("Submission file created!")


Submission file created!
