In [26]:
# code block to import all packages used throughout
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [27]:
# Read the train/test CSVs (both indexed by 'Id') and build one combined feature frame.
X_train, X_test = (
    pd.read_csv(csv_name, index_col='Id') for csv_name in ('train.csv', 'test.csv')
)

# Detach the target column; after this X_train holds predictor columns only.
Y_train = X_train.pop('SalePrice')

# Stack train on top of test so missing values are handled on both in a single pass.
X_full = pd.concat([X_train, X_test], axis=0)

# Distinct column dtypes present in the combined frame.
X_full.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [28]:
# Bucket the columns by kind and find the ones that contain missing values.
null_counts = X_full.isna().sum()  # number of NaNs in each column
cat_cols = list(X_full.select_dtypes(include='O').columns)  # categorical (object dtype)
num_cols = [col for col in X_full.columns if col not in cat_cols]  # every non-object column
null_cols = X_full.columns[null_counts > 0].tolist()  # columns with at least one NaN
many_nulls = X_full.columns[null_counts > (len(X_full) // 25)]  # NaN count above 1/25 of the rows
X_full[many_nulls].dtypes  # dtypes of the heavily-null columns

LotFrontage     float64
Alley            object
MasVnrType       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

In [29]:
# Fill every missing value in X_full.
# GarageYrBlt and LotFrontage get a 0 placeholder first, so the generic fill below is a no-op for them.
X_full['GarageYrBlt'] = X_full['GarageYrBlt'].fillna(0)  # 0 is used to mean "no garage"
X_full['LotFrontage'] = X_full['LotFrontage'].fillna(0)  # placeholder; handled properly further down

# Remaining null columns: categoricals get the literal 'None' level, numerics their column median.
fill_values = {
    col: 'None' if col in cat_cols else X_full[col].median()
    for col in null_cols
}
X_full = X_full.fillna(value=fill_values)

X_full.columns[X_full.isna().any()]  # sanity check: expect an empty Index


Index([], dtype='object')

In [30]:
# Ordinal-encode the categorical columns into a separate frame; X_full itself is left untouched here.
encoder = OrdinalEncoder()
cat_codes = encoder.fit(X_full[cat_cols]).transform(X_full[cat_cols])

# Work on a copy so the encoded values do not overwrite the original labels.
X_encoded = X_full.copy()
X_encoded[cat_cols] = cat_codes

X_encoded.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,4.0,65.0,8450,1.0,1.0,3.0,3.0,0.0,4.0,...,0,0,3.0,4.0,1.0,0,2,2008,9.0,4.0
2,20,4.0,80.0,9600,1.0,1.0,3.0,3.0,0.0,2.0,...,0,0,3.0,4.0,1.0,0,5,2007,9.0,4.0
3,60,4.0,68.0,11250,1.0,1.0,0.0,3.0,0.0,4.0,...,0,0,3.0,4.0,1.0,0,9,2008,9.0,4.0
4,70,4.0,60.0,9550,1.0,1.0,0.0,3.0,0.0,0.0,...,0,0,3.0,4.0,1.0,0,2,2006,9.0,0.0
5,60,4.0,84.0,14260,1.0,1.0,0.0,3.0,0.0,2.0,...,0,0,3.0,4.0,1.0,0,12,2008,9.0,4.0


In [31]:
# Impute LotFrontage with a model instead of keeping the 0 placeholder.
# EDA showed that a missing value does not mean "no lot frontage", so predicting it is
# preferable to assuming 0. Rows whose LotFrontage is 0 (the placeholder written when the
# NaNs were filled) are treated as the missing ones.

# Compute the "missing" mask once; as a plain boolean array it can be applied positionally
# to both X_encoded and X_full (X_encoded is a row-for-row copy of X_full).
missing_lot = (X_encoded['LotFrontage'] == 0).to_numpy()

# Rows to predict for (features only) and rows with a known frontage to learn from.
X_lot_vals = X_encoded.loc[missing_lot].drop(columns=['LotFrontage'])
known_lot = X_encoded.loc[~missing_lot]
X_lot_train = known_lot.drop(columns=['LotFrontage'])
Y_lot = known_lot['LotFrontage']

# Scale the encoded features, then regress LotFrontage with gradient-boosted trees.
# `pipe` is defined unconditionally because it is reused as the estimator for the grid search.
pipe = Pipeline(steps=[("prep", StandardScaler()), ("model", XGBRegressor())])

if missing_lot.any():  # nothing to fit or predict when no placeholder rows exist
    pipe.fit(X_lot_train, Y_lot)
    lot_pred = pipe.predict(X_lot_vals)
    # One vectorised write-back: predictions come out in the same row order as the mask.
    X_full.loc[missing_lot, 'LotFrontage'] = lot_pred

X_full['LotFrontage'].iloc[1:5]

# now we have X_full with no na values

Id
2    80.0
3    68.0
4    60.0
5    84.0
Name: LotFrontage, dtype: float64

In [None]:
# Pairwise correlations between the numeric predictors (labels on both axes are the column names).
numeric_predictors = X_full[num_cols]
corr_matrix = numeric_predictors.corr()

# find highest correlations
def multicollinearity(matrix, threshold=0.65):
    """Print every pair of distinct features whose correlation is stronger than ``threshold``.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Square correlation matrix carrying the same feature labels on rows and columns.
    threshold : float, default 0.65
        A pair is reported when the absolute value of its correlation is strictly
        greater than this value (so both strong positive and strong negative pairs).

    Returns
    -------
    None
        The result is printed, one line per ordered pair; a symmetric matrix therefore
        lists each pair twice, as (A, B) and as (B, A).
    """
    for row_label in matrix.index:
        for col_label in matrix.columns:
            # Skip a feature paired with itself by label rather than by "value below 1.0":
            # the value test would also hide distinct features that are perfectly correlated.
            if row_label == col_label:
                continue
            corr = matrix.at[row_label, col_label]
            # NaN correlations compare False here and are silently skipped.
            if abs(corr) > threshold:
                print(f"Features: {row_label}, {col_label} - Correlation: {corr}")

# print the numeric feature pairs whose correlation is beyond +/-0.65
multicollinearity(corr_matrix)

Features: TotalBsmtSF, 1stFlrSF - Correlation: 0.8016377464888657
Features: 1stFlrSF, TotalBsmtSF - Correlation: 0.8016377464888657
Features: 2ndFlrSF, GrLivArea - Correlation: 0.6550846935436292
Features: GrLivArea, 2ndFlrSF - Correlation: 0.6550846935436292
Features: GrLivArea, TotRmsAbvGrd - Correlation: 0.8083544205418535
Features: BedroomAbvGr, TotRmsAbvGrd - Correlation: 0.6697372307298517
Features: TotRmsAbvGrd, GrLivArea - Correlation: 0.8083544205418535
Features: TotRmsAbvGrd, BedroomAbvGr - Correlation: 0.6697372307298517
Features: GarageCars, GarageArea - Correlation: 0.8896894076828877
Features: GarageArea, GarageCars - Correlation: 0.8896894076828877


In [None]:
# Collapse the strongly correlated size / garage / room columns into three combined
# features (a judgement call guided by the correlations found earlier) to limit overfitting.
# NOTE(review): GrLivArea may already include the 1st- and 2nd-floor areas, in which case SF
# counts them twice — confirm against the data dictionary.
merged_away = ['GrLivArea', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr']

# Add the combined columns at the end of the frame, then remove the columns they replace.
X_full = X_full.assign(
    SF=X_full['1stFlrSF'] + X_full['TotalBsmtSF'] + X_full['2ndFlrSF'] + X_full['GrLivArea'],
    Garage=X_full['GarageArea'] + 0.2 * X_full['GarageCars'],
    Rooms=X_full['TotRmsAbvGrd'] + 0.4 * X_full['BedroomAbvGr'],
).drop(columns=merged_away)
X_full.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SF,Garage,Rooms
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,2,2008,WD,Normal,4276.0,548.4,9.2
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,,0,5,2007,WD,Normal,3786.0,460.4,7.2
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,,0,9,2008,WD,Normal,4492.0,608.4,7.2
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,,0,2,2006,WD,Abnorml,4190.0,642.6,8.2
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,,0,12,2008,WD,Normal,5541.0,836.6,10.6


In [34]:
# Rebuild the categorical / numeric column lists now that the column set has changed,
# then re-check the numeric predictors for strongly correlated pairs.
cat_cols = list(X_full.select_dtypes(include='O').columns)
num_cols = [col for col in X_full.columns if col not in cat_cols]
corr_matrix1 = X_full[num_cols].corr()
multicollinearity(corr_matrix1)

Features: OverallQual, SF - Correlation: 0.6513028329148756
Features: SF, OverallQual - Correlation: 0.6513028329148756
Features: SF, Rooms - Correlation: 0.720167809983438
Features: Rooms, SF - Correlation: 0.720167809983438


In [35]:
# The remaining correlated pairs are clearly not worth combining any further, so feature merging stops here.
# Preview the reduced feature frame.
X_full.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SF,Garage,Rooms
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,2,2008,WD,Normal,4276.0,548.4,9.2
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,,0,5,2007,WD,Normal,3786.0,460.4,7.2
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,,0,9,2008,WD,Normal,4492.0,608.4,7.2
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,,0,2,2006,WD,Abnorml,4190.0,642.6,8.2
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,,0,12,2008,WD,Normal,5541.0,836.6,10.6


In [36]:
# Ordinal-encode the categorical columns of X_full (same approach as used for X_encoded),
# then separate the rows back into train and test using the Ids of the original frames.

X_full[cat_cols] = encoder.fit(X_full[cat_cols]).transform(X_full[cat_cols])
X_train_enc, X_test_enc = (X_full.loc[part.index] for part in (X_train, X_test))

X_test_enc.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SF,Garage,Rooms
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,3.0,80.0,11622,1.0,1.0,3.0,3.0,0.0,4.0,...,2.0,1.0,0,6,2010,9.0,4.0,2674.0,730.2,5.8
1462,20,4.0,81.0,14267,1.0,1.0,0.0,3.0,0.0,0.0,...,4.0,0.0,12500,6,2010,9.0,4.0,3987.0,312.2,7.2
1463,60,4.0,74.0,13830,1.0,1.0,0.0,3.0,0.0,4.0,...,2.0,1.0,0,3,2010,9.0,4.0,4186.0,482.4,7.2
1464,60,4.0,78.0,9978,1.0,1.0,0.0,3.0,0.0,4.0,...,4.0,1.0,0,6,2010,9.0,4.0,4134.0,470.4,8.2
1465,120,4.0,43.0,5005,1.0,1.0,0.0,1.0,0.0,4.0,...,4.0,1.0,0,1,2010,9.0,4.0,3840.0,506.4,5.8


In [None]:
# Hyper-parameter grid for the pipeline's "model" step (3 x 3 x 3 = 27 combinations).
p_grid = dict(
    model__n_estimators=[100, 150, 200],
    model__max_depth=[4, 5, 6],
    model__learning_rate=[0.1, 0.05, 0.01],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=p_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train_enc, Y_train)

# Keep the best pipeline found by the search and display the parameters it used.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_params

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
# Predict sale prices for the test rows with the tuned pipeline.
predictions = best_model.predict(X_test_enc)

# Write the submission file: one row per test Id alongside its predicted SalePrice.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)