# Kaggle House Prices Project

In this project I'll build a preprocessing pipeline and feed the preprocessed data to an XGBoost (extreme gradient boosting) regressor in order to predict house sale prices based on selected features.

In [1]:
# Import all the relevant libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [37]:
# Load the Kaggle training and test sets; the 'Id' column becomes the row index of each frame
train_csv = '/Users/Jonas/Desktop/DataScience/Kaggle/Housing/CSVs/train.csv'
test_csv = '/Users/Jonas/Desktop/DataScience/Kaggle/Housing/CSVs/test.csv'
X_full = pd.read_csv(train_csv, index_col='Id')
X_test_full = pd.read_csv(test_csv, index_col='Id')

In [38]:
# Discard training rows that have no SalePrice, then split the target off the predictors:
# pop() removes the column from X_full and hands it back as y
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full.pop('SalePrice')

In [39]:
# Categorical columns containing missing values are going to be dropped.
# Step 1: collect every categorical (object-dtype) column of the training data
cat_cols = X_full.columns[X_full.dtypes == 'object'].tolist()

# Step 2: flag the ones with at least one missing value in EITHER the training
# or the test data (IMPORTANT: both frames are checked, not just the training one)
has_missing = X_full[cat_cols].isnull().any() | X_test_full[cat_cols].isnull().any()
cat_with_missing = has_missing[has_missing].index.tolist()

In [40]:
# Remove the incomplete categorical columns from the training and the test frame alike
X_full = X_full.drop(columns=cat_with_missing)
X_test_full = X_test_full.drop(columns=cat_with_missing)

In [41]:
# Hold out 20% of the training rows as a validation set (fixed seed keeps the split reproducible)
split_parts = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

In [42]:
# Recompute the list of categorical (object-dtype) columns that are still present
# after the drop above; these are the candidates for OH encoding
cat_cols = X_train.columns[X_train.dtypes == 'object'].tolist()

In [43]:
# Cardinality check: count the distinct values of every categorical column in the
# training split and display the columns ordered from fewest to most distinct values
d = {col: X_train[col].nunique() for col in cat_cols}
unique_vals = list(d.values())
sorted_d = sorted(d.items(), key=lambda pair: pair[1])
sorted_d

[('Street', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('Neighborhood', 25)]

In [44]:
# Partition the categorical columns by cardinality: fewer than 10 distinct values in
# the training split counts as "low", everything else as "high"
cardinality = X_train[cat_cols].nunique()
low_cardinality = cardinality[cardinality < 10].index.tolist()
high_cardinality = list(set(cat_cols).difference(low_cardinality))

In [63]:
# One-hot encode the low-cardinality categorical columns (high-cardinality ones are
# not encoded). The encoder is fitted on the training split only; with
# handle_unknown='ignore' a category that was not seen during fitting is encoded as
# all zeros instead of raising an error.
#
# The `sparse=False` keyword was renamed to `sparse_output` in newer scikit-learn
# releases and the old spelling no longer exists there, so it fails with a TypeError.
# Relying on the default (sparse) output and densifying with .toarray() yields the
# same dense array on old and new versions alike.
oh_encoder = OneHotEncoder(handle_unknown='ignore')
oh_train = pd.DataFrame(oh_encoder.fit_transform(X_train[low_cardinality]).toarray())
oh_valid = pd.DataFrame(oh_encoder.transform(X_valid[low_cardinality]).toarray())
oh_test = pd.DataFrame(oh_encoder.transform(X_test_full[low_cardinality]).toarray())

In [64]:
# The encoder returned plain arrays, so the new frames carry a fresh 0..n-1 index;
# give each one the row labels of the frame it was derived from
for encoded, source in ((oh_train, X_train), (oh_valid, X_valid), (oh_test, X_test_full)):
    encoded.index = source.index

In [65]:
# Keep only the non-categorical columns; every column in cat_cols is removed here
# (the OH-encoded columns are attached again further down)
X_train_num = X_train.drop(columns=cat_cols)
X_valid_num = X_valid.drop(columns=cat_cols)
X_test_num = X_test_full.drop(columns=cat_cols)

In [66]:
# Fill missing numerical values with SimpleImputer's default strategy (column mean).
# The imputer learns its fill values from the training split only and is then
# applied unchanged to the training, validation and test data.
imp = SimpleImputer().fit(X_train_num)
imputed_X_train = pd.DataFrame(imp.transform(X_train_num))
imputed_X_valid = pd.DataFrame(imp.transform(X_valid_num))
imputed_X_test = pd.DataFrame(imp.transform(X_test_num))

In [67]:
# The imputer output has no column names; copy them over from the matching source frame
for imputed, source in (
    (imputed_X_train, X_train_num),
    (imputed_X_valid, X_valid_num),
    (imputed_X_test, X_test_num),
):
    imputed.columns = source.columns

In [68]:
# Copy the row labels over as well: the concat below aligns on the index, so
# mismatching labels would produce NaN-filled rows
for imputed, source in (
    (imputed_X_train, X_train_num),
    (imputed_X_valid, X_valid_num),
    (imputed_X_test, X_test_num),
):
    imputed.index = source.index

In [75]:
# Build the final feature frames: imputed numerical columns first, followed by the
# OH-encoded categorical columns, joined side by side on the shared row index
X_train_final = pd.concat((imputed_X_train, oh_train), axis='columns')
X_valid_final = pd.concat((imputed_X_valid, oh_valid), axis='columns')
X_test_final = pd.concat((imputed_X_test, oh_test), axis='columns')

In [76]:
# Configure the gradient-boosted regressor: up to 1000 boosting rounds with a
# learning rate of 0.05
xgb_params = {'n_estimators': 1000, 'learning_rate': 0.05}
xgb = XGBRegressor(**xgb_params)

In [77]:
# Train on the training split while monitoring the validation split; training stops
# early once the validation metric has not improved for 5 consecutive rounds.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the estimator
# constructor instead of fit() - confirm against the installed version.
validation_pairs = [(X_valid_final, y_valid)]
xgb.fit(
    X_train_final,
    y_train,
    eval_set=validation_pairs,
    early_stopping_rounds=5,
    verbose=False,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [78]:
# Evaluate on the validation split: predict, then report the mean absolute error
# between the true and the predicted sale prices
preds = xgb.predict(X_valid_final)
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
score

17198.30079462757

In [80]:
# Make predictions on the test set, using the fully preprocessed test frame
# (imputed numerical columns plus OH-encoded categorical columns)
preds_test = xgb.predict(X_test_final)

In [81]:
# Write the submission file: one row per test-set house with its Id and predicted
# SalePrice. The Ids are taken from the index of X_test_final, the frame the
# predictions were computed from, so Ids and predictions are in the same row order.
# (The previous reference to `X_test` pointed at a name that is never defined in
# this notebook and raised a NameError.)
output = pd.DataFrame({'Id': X_test_final.index,
                       'SalePrice': preds_test})
# index=False: the Id is already a regular column, so the frame's own index is not written
output.to_csv('/Users/Jonas/Desktop/DataScience/Kaggle/Housing/XGB.csv', index=False)