# House Prices - Advanced Regression Techniques 

## Preparation

In [29]:

import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# Directories (relative to the notebook) for the input CSVs and for generated submissions
DATA_PATH = "./data/"
SUBMISSIONS_PATH = "./submissions/"

# Load both CSVs, using the 'Id' column as the row index of each frame
training, testing = (
    pd.read_csv(f"{DATA_PATH}{split}.csv", index_col='Id')
    for split in ('train', 'test')
)


def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error between two value sequences.

    Both inputs are passed through ``np.log1p`` (i.e. ``log(1 + x)``), the
    element-wise differences are squared and averaged, and the square root of
    that mean is returned.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2))``.
    """
    log_error = np.log1p(y_true) - np.log1p(y_pred)
    mean_squared_log_error = np.mean(np.square(log_error))
    return np.sqrt(mean_squared_log_error)

## EDA

In [30]:
# Count missing values per column in both data sets, then list every training
# column that has at least one missing value together with its count.
training_na = training.isna().sum()
testing_na = testing.isna().sum()
# Iterate (label, value) pairs instead of indexing the Series with an integer:
# integer keys on a string-labelled Series rely on a deprecated positional
# fallback in pandas and emit a FutureWarning.
for column, na_count in training_na[training_na > 0].items():
    print(column, na_count)

LotFrontage 259
Alley 1369
MasVnrType 872
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


  if training_na[i] > 0:
  print(training_na.index[i], training_na[i])


In [31]:
# Numerical features are listed by hand instead of being derived with
# training.select_dtypes(include=[np.number]); every remaining column except
# the target is then treated as categorical (this includes MSSubClass).
numerical_features = [
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

# Everything that is neither a numerical feature nor the target is categorical;
# the original column order of `training` is preserved.
non_categorical = set(numerical_features) | {'SalePrice'}
categorical_features = [col for col in training.columns if col not in non_categorical]

print(categorical_features)


['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [32]:
# Report the NaN count of every feature that has missing values, first for the
# numerical and then for the categorical features, for both data sets.
def print_missing(frame, columns):
    """Print ``name count`` for each column in ``columns`` of ``frame`` that contains NaN."""
    for name in columns:
        missing = frame[name].isna().sum()
        if missing > 0:
            print(name, missing)


for heading, frame in (("Training", training), ("\n\nTesting", testing)):
    print(heading)
    print("=========================================")
    print_missing(frame, numerical_features)
    print("-----------------------------------------")
    print_missing(frame, categorical_features)

Training
LotFrontage 259
MasVnrArea 8
GarageYrBlt 81
-----------------------------------------
Alley 1369
MasVnrType 872
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


Testing
LotFrontage 227
MasVnrArea 15
BsmtFinSF1 1
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
BsmtFullBath 2
BsmtHalfBath 2
GarageYrBlt 78
GarageCars 1
GarageArea 1
-----------------------------------------
MSZoning 4
Alley 1352
Utilities 2
Exterior1st 1
Exterior2nd 1
MasVnrType 894
BsmtQual 44
BsmtCond 45
BsmtExposure 44
BsmtFinType1 42
BsmtFinType2 42
KitchenQual 1
Functional 2
FireplaceQu 730
GarageType 76
GarageFinish 78
GarageQual 78
GarageCond 78
PoolQC 1456
Fence 1169
MiscFeature 1408
SaleType 1


In [33]:
# Impute missing values identically in the training and the testing data so
# that both sets go through the same preprocessing.
for frame in (training, testing):
    # A missing GarageYrBlt is replaced by the year the house was built.
    # This must happen before the generic 0-fill below, otherwise the value
    # would become 0. Previously only `training` received this treatment,
    # leaving `testing` with GarageYrBlt == 0 for the same kind of rows.
    frame['GarageYrBlt'] = frame['GarageYrBlt'].fillna(frame['YearBuilt'])
    # Remaining numerical gaps are filled with 0.
    frame[numerical_features] = frame[numerical_features].fillna(0)
    # For categorical features a missing value becomes its own category "NA".
    frame[categorical_features] = frame[categorical_features].fillna("NA")

# check if there are any nan values left
print("Training")
print("=========================================")
print(training.isna().sum().sum())
print("\n\nTesting")
print("=========================================")
print(testing.isna().sum().sum())


Training
0


Testing
0


In [34]:
# One-hot encode the categorical features with scikit-learn.
# With handle_unknown='ignore', a category that was not seen while fitting is
# encoded as all zeros for that feature instead of raising an error.
from sklearn.preprocessing import OneHotEncoder

# Fit on the training data only; the same fitted encoder is applied to both sets.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(training[categorical_features])
encoded_columns = encoder.get_feature_names_out(categorical_features)

# Densify the sparse encoder output into DataFrames that keep the original row index.
training_encoded = pd.DataFrame(
    encoder.transform(training[categorical_features]).toarray(),
    columns=encoded_columns,
    index=training.index,
)
testing_encoded = pd.DataFrame(
    encoder.transform(testing[categorical_features]).toarray(),
    columns=encoded_columns,
    index=testing.index,
)

# Swap the raw categorical columns for their one-hot encoded counterparts.
training = pd.concat([training.drop(columns=categorical_features), training_encoded], axis=1)
testing = pd.concat([testing.drop(columns=categorical_features), testing_encoded], axis=1)

In [35]:
# Correlation of every column with SalePrice, ordered by absolute strength
# (strongest first). SalePrice itself is part of the listing.
correlation_matrix = training.corr()
correlation_matrix = correlation_matrix['SalePrice'].sort_values(key=lambda s: s.abs(), ascending=False)
# Iterate (label, value) pairs instead of indexing the Series with an integer:
# integer keys on a string-labelled Series rely on a deprecated positional
# fallback in pandas and emit a FutureWarning.
for column, correlation in correlation_matrix.items():
    print(column, correlation)


SalePrice 1.0
OverallQual 0.7909816005838053
GrLivArea 0.7086244776126515
GarageCars 0.6404091972583519
GarageArea 0.6234314389183622
TotalBsmtSF 0.6135805515591943
1stFlrSF 0.6058521846919153
ExterQual_TA -0.5890435234097585
FullBath 0.5606637627484453
BsmtQual_Ex 0.5531048470089394
TotRmsAbvGrd 0.5337231555820284
YearBuilt 0.5228973328794967
KitchenQual_TA -0.519297853654885
GarageYrBlt 0.5080432871615161
YearRemodAdd 0.5071009671113866
KitchenQual_Ex 0.504093675905297
Foundation_PConc 0.4977337525869374
MasVnrArea 0.4726144990045735
FireplaceQu_NA -0.4719080685164943
Fireplaces 0.46692883675152763
ExterQual_Gd 0.4524661278447931
BsmtQual_TA -0.4523935323501028
ExterQual_Ex 0.4511643302227566
BsmtFinType1_GLQ 0.43459734688277624
HeatingQC_Ex 0.43454323853246873
GarageFinish_Fin 0.41967796781801636
GarageFinish_Unf -0.4106083112916732
Neighborhood_NridgHt 0.4021485981752677
BsmtFinSF1 0.3864198062421535
MSSubClass_60 0.3771970684281052
MasVnrType_NA -0.36745636519324876
SaleType_New 0

  print(correlation_matrix.index[i], correlation_matrix[i])


In [37]:
# Fit a simple random forest on an 80/20 hold-out split, print its RMSLE on the
# held-out part, then print every feature's importance from highest to lowest.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = training.drop(columns=['SalePrice'])
y = training['SalePrice']

# Fixed random_state values keep the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(rmsle(y_test, y_pred))

# Label the importances with the feature names and sort them in descending order.
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
# Iterate (label, value) pairs instead of indexing the Series with an integer:
# integer keys on a string-labelled Series rely on a deprecated positional
# fallback in pandas and emit a FutureWarning.
for feature_name, importance in feature_importances.items():
    print(feature_name, importance)

0.15397593978831434
OverallQual 0.5579222706719996
GrLivArea 0.1216423452147259
2ndFlrSF 0.034936151984660505
TotalBsmtSF 0.034902040243386466
BsmtFinSF1 0.027982566780584764
1stFlrSF 0.02674684436299004
LotArea 0.01750864544253766
GarageArea 0.015367638966453271
YearBuilt 0.012386311601493104
GarageCars 0.012072741227534496
LotFrontage 0.0074189094306328605
YearRemodAdd 0.007058134790247763
TotRmsAbvGrd 0.006251334507398406
FullBath 0.006067406191395254
OpenPorchSF 0.005961848823922781
BsmtQual_Ex 0.005266413427012975
GarageYrBlt 0.005161930336127442
BsmtUnfSF 0.004952120638482053
WoodDeckSF 0.0040096015612345855
KitchenQual_Gd 0.0038963379514183544
OverallCond 0.003355270641385652
MoSold 0.0031971736398335175
ScreenPorch 0.003146113824614348
MasVnrArea 0.0030660080516014725
BsmtQual_Gd 0.0027667155662304565
CentralAir_Y 0.0022416333845198654
GarageType_Detchd 0.0018739419271422933
GarageType_Attchd 0.0016972495603768064
Fireplaces 0.0015744302011589364
CentralAir_N 0.0015585090054039

  print(feature_importances.index[i], feature_importances[i])


In [7]:
# min max scale the numerical features
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# training[numerical_features] = scaler.fit_transform(training[numerical_features])
# testing[numerical_features] = scaler.transform(testing[numerical_features])


In [8]:
# # train an autogluon model and report its accuracy using rmsle
# from autogluon.tabular import TabularPredictor

# predictor = TabularPredictor(label='SalePrice').fit(training)
# predictions = predictor.predict(testing)

# # save the predictions to a csv file
# predictions.to_csv(SUBMISSIONS_PATH + 'autogluon_noscaling.csv')

No path specified. Models will be saved in: "AutogluonModels/ag-20240430_011751"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240430_011751"
AutoGluon Version:  1.0.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct 5 21:02:42 UTC 2023
CPU Count:          16
Memory Avail:       15.54 GB / 19.22 GB (80.8%)

[1000]	valid_set's rmse: 30207.2
[2000]	valid_set's rmse: 29300.7
[3000]	valid_set's rmse: 29142.1
[4000]	valid_set's rmse: 29120.2
[5000]	valid_set's rmse: 29123.6


	-29120.0993	 = Validation score   (-root_mean_squared_error)
	7.67s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 30041.9
[2000]	valid_set's rmse: 29444.3
[3000]	valid_set's rmse: 29331.9
[4000]	valid_set's rmse: 29310.8
[5000]	valid_set's rmse: 29308.7
[6000]	valid_set's rmse: 29308.1
[7000]	valid_set's rmse: 29308.3


	-29308.0155	 = Validation score   (-root_mean_squared_error)
	9.78s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-33850.7269	 = Validation score   (-root_mean_squared_error)
	1.2s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: CatBoost ...
	-29138.189	 = Validation score   (-root_mean_squared_error)
	35.76s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-32733.152	 = Validation score   (-root_mean_squared_error)
	1.1s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-50688.2046	 = Validation score   (-root_mean_squared_error)
	3.62s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-32586.3257	 = Validation score   (-root_mean_squared_error)
	3.15s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-35024.6622	 = Validation score   (-root_mean_squared_error)
	4.59s	 = Training   runtime
