In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test tables, using the 'Id' column as the row index
X = pd.read_csv('../input/home-data-for-ml-course/train.csv', index_col='Id')
X_test = pd.read_csv('../input/home-data-for-ml-course/test.csv', index_col='Id')

# Separate the target from the features: y holds 'SalePrice', X keeps the rest
y = X['SalePrice']
X = X.drop(columns='SalePrice')

X.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows as a validation set (used for xgb);
# the fixed random_state keeps the split identical between runs
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Print a per-column summary of the feature table (dtype and non-null count),
# which shows at a glance which columns contain missing values
X.info()

In [None]:
# Display the target series: the 'SalePrice' column split off from X above
y

In [None]:
# Collect the object-dtype (categorical) columns with fewer than 15 distinct values
low_card_cols = {
    name
    for name in X.columns
    if X[name].dtype == 'object' and X[name].nunique() < 15
}
low_card_cols

In [None]:
# Collect the columns stored as 64-bit integers or floats
numerical_cols = {name for name in X.columns if X[name].dtype in ('int64', 'float64')}
numerical_cols

In [None]:
# Summary statistics for every column whose name starts with 'Garage'
garage_cols = list(filter(lambda name: name.startswith('Garage'), X.columns))
X[garage_cols].describe()

In [None]:
# Names of the numerical garage columns.
# To consider: type, finish, cond, qual -> all categorical, many NaN
numerical_garage = ['GarageYrBlt', 'GarageCars', 'GarageArea']

# Look at the rows whose garage area is zero (presumably houses without a garage).
# This is deliberately the last expression of the cell: a notebook only renders
# the final expression, so placing it earlier silently discarded the table.
zero_area = X[X['GarageArea'] == 0]
zero_area[garage_cols].head(10)

In [None]:
# Garage columns that are not in the numerical list, i.e. the remaining
# (object) garage columns to be grouped with the many-null columns below
obj_garage_cols = set(garage_cols).difference(numerical_garage)
print(obj_garage_cols)

In [None]:
# Remove the year the garage was built: the other garage info will be enough
# and there are plenty of null values in this column.
# The list is rebuilt in X's own column order instead of via list(set - set):
# iteration order of a set of strings can change between kernel sessions, which
# would reorder the model's input features and make scores irreproducible.
# Membership testing (rather than set subtraction) also lets this cell be
# re-run after numerical_cols has already become a list.
numerical_cols = [
    col for col in X.columns
    if col in numerical_cols and col != 'GarageYrBlt'
]
numerical_cols

In [None]:
# Count the missing values per column, keeping only columns that have any
nulls = X.isna().sum().loc[lambda counts: counts > 0]
# Of those, the columns with more than 100 missing entries
many_nulls_list = nulls.loc[nulls > 100]
print(many_nulls_list)

"""
LotFrontage: Linear feet of street connected to property
most likely data not avaliable

Alley: Type of alley access to property
Makes sense to input as a constant value of none

FireplaceQu: Fireplace quality
Many NA fireplaces -> null

PoolQC: Pool quality
Many NA pools -> null

Fence: Fence Quality
Many NA fences- > null

MiscFeature: Miscellaneous feature not covered in other categories
Many NA (none) other features -> null

"""

In [None]:
# For each column with many null values, gather the set of distinct non-null
# values it takes (the imputers and data transformers applied to these columns
# may impact results)
many_null_cols = list(many_nulls_list.index)
many_null_col_vals = [set(X[name].dropna()) for name in many_null_cols]
many_nulls = {name: values for name, values in zip(many_null_cols, many_null_col_vals)}
many_nulls

In [None]:
# Low-cardinality categorical columns that have more than 10 missing values;
# these are handled separately from the rest of the categorical columns.
# X's columns are walked (instead of iterating the low_card_cols set directly)
# so the resulting list has the same order in every kernel session.
replace_null_cols = [
    col for col in X.columns
    if col in low_card_cols and X[col].isna().sum() > 10
]
replace_null_cols

In [None]:
# add the columns with many nulls to the garage categorical cols to get the final columns with many nulls to replace
# replace_null_cols = many_null_cols[1:] + list(obj_garage_cols)
# replace_null_cols 

In [None]:
# Drop the columns listed in replace_null_cols from the low-cardinality
# categorical columns, leaving the two groups disjoint.
# Filtering X's columns (rather than list(set - set)) gives a reproducible
# column order, and membership testing keeps the cell re-runnable once
# low_card_cols has already been turned into a list.
low_card_cols = [
    col for col in X.columns
    if col in low_card_cols and col not in replace_null_cols
]
low_card_cols

In [None]:
# Missing-value count for each remaining low-cardinality categorical column
X[low_card_cols].isna().sum()
# columns with more than 10 missing values were moved to replace_null_cols, so only counts of 10 or fewer should remain here

In [None]:
# Gather the columns whose name starts with 'Lot', ordered from the largest to
# the smallest lot frontage, to judge whether lot frontage can be removed
lots = [name for name in X.columns if name.startswith('Lot')]
lot_df = X.loc[:, lots].sort_values('LotFrontage', ascending=False)
lot_df.head(20)

In [None]:
# plot lot frontage vs lot area
import seaborn as sns
# Scatter plot with a fitted regression line, selecting the columns by name
ax = sns.regplot(data=lot_df, x='LotFrontage', y='LotArea')
ax.set_title('Lot frontage vs area')

In [None]:
# calculate pearson correlation between the numeric 'Lot*' columns only.
# Any non-numeric columns in lot_df cannot be correlated; older pandas versions
# skipped them silently, while newer ones raise an error, so they are excluded
# explicitly to get the same table on every version.
pearsoncorr = lot_df.select_dtypes(include='number').corr(method='pearson')
pearsoncorr

# consider dropping lot frontage since there is moderate correlation with lot area

In [None]:
# Restrict every feature table (full train, test, train split, validation split)
# to the numerical, low-cardinality categorical and many-null columns, taking an
# independent copy of each so later steps cannot touch the originals
X_clean, X_test_clean, X_train_unfit, X_valid_unfit = (
    frame[numerical_cols + low_card_cols + replace_null_cols].copy()
    for frame in (X, X_test, X_train_full, X_valid_full)
)
X_clean.head()

In [None]:
# Print the (rows, columns) shape of the validation split and of the training split
print(X_valid_unfit.shape, X_train_unfit.shape)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error

# Numerical columns: fill missing values with the column median, then scale
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Columns with many null values: fill with a constant placeholder, then one-hot
# encode. Defined for experimentation; it is not part of preprocessor_1 below.
many_null_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Categorical columns (first imputer strategy under test): fill missing values
# with the most frequent category, then one-hot encode; categories unseen at
# fit time are ignored at transform time
categorical_transformer_1 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route each column group to its transformer
preprocessor_1 = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('categorical', categorical_transformer_1, low_card_cols + replace_null_cols),
])

# ('many_null', many_null_transformer, replace_null_cols)

In [None]:
from sklearn.model_selection import cross_val_score
from typing import Union
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

def test_model(model: Union[RandomForestRegressor, xgb.XGBRegressor], preprocesser: ColumnTransformer) -> float:
    """Return the mean 5-fold cross-validated mean absolute error of ``model``.

    Two evaluation paths exist:

    * ``RandomForestRegressor`` (or a subclass): the model is chained after the
      preprocessor in a pipeline and cross-validated on the full cleaned
      training data (``X_clean``, ``y``).
    * Any other model is treated as an XGBoost regressor: the preprocessor is
      fitted on the training split (``X_train_unfit``), the validation split
      (``X_valid_unfit``) is transformed with it and passed as the early-stopping
      ``eval_set``, and the model is cross-validated on the transformed
      training split (``X_train``, ``y_train``).

    Because the two paths score on different rows (all rows vs. the training
    split only), their results are not measured on exactly the same data.

    Parameters
    ----------
    model : RandomForestRegressor or xgb.XGBRegressor
        Unfitted estimator to evaluate.
    preprocesser : ColumnTransformer
        Column preprocessing to apply before the model. It is cloned, so the
        object passed in is never fitted or otherwise modified.

    Returns
    -------
    float
        Mean of the five per-fold MAE values (lower is better).
    """
    # Local import keeps this cell self-contained
    from sklearn.base import clone

    # Work on an unfitted copy: fitting the caller's transformer in place would
    # leave hidden fitted state on a shared notebook-level object.
    preprocesser = clone(preprocesser)

    # isinstance (rather than an exact type comparison) also sends subclasses of
    # RandomForestRegressor down the random-forest path.
    if isinstance(model, RandomForestRegressor):
        # create the pipeline for the model
        model_pipeline = Pipeline(steps = [
            ('preprocess', preprocesser),
            ('model', model)
        ])

        # 5-fold cross validation (80:20 splits) over all cleaned training data
        scores = -1 * cross_val_score(model_pipeline, X_clean, y, cv=5, scoring='neg_mean_absolute_error')
        return scores.mean()

    # XGBoost path: the eval set must already be preprocessed before it is
    # handed to fit, so the preprocessor is applied outside of a pipeline.
    X_train = preprocesser.fit_transform(X_train_unfit)
    X_val_eval = preprocesser.transform(X_valid_unfit)

    # Stop boosting once validation MAE has not improved for 20 rounds.
    # NOTE(review): newer xgboost / scikit-learn releases changed how these
    # fit-time arguments are passed - confirm against the installed versions.
    fit_params = {'early_stopping_rounds': 20,
        'eval_metric': 'mae',
        'verbose': True,
        'eval_set': [(X_val_eval, y_valid)]}

    # 5-fold cross validation (80:20 splits) over the training split only; the
    # validation split is reserved for early stopping
    scores = -1 * cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', fit_params=fit_params)

    return scores.mean()

In [None]:
# create a few random forest models
# model_1 = RandomForestRegressor(n_estimators=100, random_state=0)
# model_2 = RandomForestRegressor(n_estimators=200, random_state=0)
# model_3 = RandomForestRegressor(n_estimators=300, random_state=0)
# model_4 = RandomForestRegressor(n_estimators=400, random_state=0)
# model_5 = RandomForestRegressor(n_estimators=325, random_state=0)

# rf_list = [model_1, model_2, model_3, model_4, model_5]
# accuracy_1 = []
# for model in rf_list:
#     accuracy_1.append(test_model(model, preprocessor_1))

"""
Results
17582.88258904109
17545.992270547944
17532.8472283105
17562.180580479453
17559.728131506847

Run 2
17696.57582191781
17629.291688356163
17618.58353652968
17606.96818150685
17606.63126027397
"""

In [None]:
# num_estimators = [100, 200, 300, 400, 325]
# if accuracy_1 == None:
#     accuracy_1 = [17668.580184931507, 17595.95196232877, 17533.18764611872, 17550.457345890412, 17548.591466807167]
# # graph the estimators v accuracy
# sns.set_style('darkgrid')
# ax = sns.lineplot(x=num_estimators, y=accuracy_1)
# ax.set_title('Model Accuracy vs Num Trees')
# ax.set_xlabel('Num Trees')
# ax.set_ylabel('House Price Prediction Error ($)')

In [None]:
# model_6 = RandomForestRegressor(n_estimators=300, max_depth=25, random_state=0)
# model_7 = RandomForestRegressor(n_estimators=300, max_depth=40, random_state=0)
# model_8 = RandomForestRegressor(n_estimators=300, max_depth=60, random_state=0)


# accuracy_2 = []
# rf_list_2 = [model_6, model_7, model_8]
# for model in rf_list_2:
#     accuracy_2.append(test_model(model, preprocessor_1))
"""
Results
17552.264642781876
17545.47420156556
17573.30147162427
"""

In [None]:
# depth_cols = [25, 40, 60]
# ax_2 = sns.lineplot(x=depth_cols, y=accuracy_2)
# ax_2.set_title('Model Accuracy vs Tree Depth')
# ax_2.set_xlabel('Tree Depth')
# ax_2.set_ylabel('House Price Prediction Error ($)')

In [None]:
# model_9 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=10, random_state=0)
# model_10 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=20, random_state=0)
# model_11 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=35, random_state=0)
# model_12 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=50, random_state=0)


# accuracy_3 = []
# rf_list_3 = [model_9, model_10, model_11, model_12]
# for model in rf_list_3:
#     accuracy_3.append(test_model(model, preprocessor_1))

In [None]:
# model_13 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features = 25, random_state=0)
# accuracy_3.append(test_model(model_13, preprocessor_1))

In [None]:
# model_13_2 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features = 30, random_state=0)
# accuracy_3.append(test_model(model_13_2, preprocessor_1))

In [None]:
# max_feature_vals = [10, 20, 35, 50, 25, 30]
# ax_3 = sns.lineplot(x=max_feature_vals, y=accuracy_3)
# ax_3.set_title('Model Accuracy vs Max Features')
# ax_3.set_xlabel('Max Features')
# ax_3.set_ylabel('House Price Prediction Error ($)')

In [None]:
# model_14 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=30, min_samples_leaf=2, random_state=0)
# model_15 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=30, min_samples_leaf=5, random_state=0)
# model_16 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=30, min_samples_leaf=10, random_state=0)
# model_17 = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=30, min_samples_leaf=20, random_state=0)


# accuracy_4 = []
# rf_list_4 = [model_14, model_15, model_16, model_17]
# for model in rf_list_4:
#     accuracy_4.append(test_model(model, preprocessor_1))

In [None]:
# min_sample_leaf_vals = [2, 5, 10, 20]
# ax_4 = sns.lineplot(x=min_sample_leaf_vals, y=accuracy_4)
# ax_4.set_title('Model Accuracy vs Min Sample Leaves')
# ax_4.set_xlabel('Min Sample Leaves')
# ax_4.set_ylabel('House Price Prediction Error ($)')

In [None]:
# model_18 = RandomForestRegressor(n_estimators=300, max_depth=40, min_samples_split=3, max_features=30, random_state=0)
# model_19 = RandomForestRegressor(n_estimators=300, max_depth=40, min_samples_split=5, max_features=30, random_state=0)
# model_20 = RandomForestRegressor(n_estimators=300, max_depth=40, min_samples_split=10, max_features=30, random_state=0)
# model_21 = RandomForestRegressor(n_estimators=300, max_depth=40, min_samples_split=20, max_features=30, random_state=0)


# accuracy_5 = []
# rf_list_5 = [model_18, model_19, model_20, model_21]
# for model in rf_list_5:
#     accuracy_5.append(test_model(model, preprocessor_1))

In [None]:
# min_samples_split_vals = [3, 5, 10, 20]
# ax_5 = sns.lineplot(x=min_samples_split_vals, y=accuracy_5)
# ax_5.set_title('Model Accuracy vs Min Sample Split')
# ax_5.set_xlabel('Min Sample Split')
# ax_5.set_ylabel('House Price Prediction Error ($)')

In [None]:
# try out the optimal rf, keep max_features at 25 to avoid over fitting
# rf_model = RandomForestRegressor(n_estimators=300, max_depth=40, max_features=25, random_state=0)
# # create the pipeline for the final model
# rf_pipeline = Pipeline(steps = [
#     ('preprocess', preprocessor_1),
#     ('model', rf_model)
# ])
# rf_pipeline.fit(X_clean, y)

# # make predictions with test data and score the model
# rf_preds_test = rf_pipeline.predict(X_test_clean)

In [None]:
# submit the rf model
# output = pd.DataFrame({'Id': X_test_clean.index,
#                        'SalePrice': rf_preds_test})
# output.to_csv('rf_submission.csv', index=False)

In [None]:
# use early stopping rounds with verbose to find a good number of n_estimators for early stopping round
# print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1000), preprocessor_1))
# ans -> 18493.424433342276

In [None]:
# use early stopping rounds with verbose to find a good number of n_estimators for early stopping round
# print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1000, learning_rate=0.05), preprocessor_1))
# ans ->16457.97133396267

In [None]:
# use early stopping rounds with verbose to find a good number of n_estimators for early stopping round
# print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1000, learning_rate=0.01), preprocessor_1))
# ans -> 16555.01474050899

In [None]:
# use early stopping rounds with verbose to find a good number of n_estimators for early stopping round
# print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1000, learning_rate=0.2), preprocessor_1))
# ans -> 17707.34432746185

In [None]:
# use early stopping rounds with verbose to find a good number of n_estimators for early stopping round
# print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1000, learning_rate=0.008), preprocessor_1))
# ans -> 16497.30312636126

In [None]:
# use early stopping rounds with verbose to find a good number of n_estimators for early stopping round
# print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1000, learning_rate=0.008), preprocessor_1))
# ans -> 16377.387845803645

In [172]:
# Score XGBoost with up to 1500 boosting rounds and a learning rate of 0.008.
# test_model fits with verbose early stopping, so the per-round validation MAE
# is logged for each fit; the log shows a good value for n_estimators.
print(test_model(xgb.XGBRegressor(random_state=0, n_estimators=1500, learning_rate=0.008), preprocessor_1))
# ans -> 16362.271127724245

[0]	validation_0-mae:179941.56250
[1]	validation_0-mae:178525.21875
[2]	validation_0-mae:177119.96875
[3]	validation_0-mae:175727.85938
[4]	validation_0-mae:174360.12500
[5]	validation_0-mae:173002.28125
[6]	validation_0-mae:171650.26562
[7]	validation_0-mae:170314.28125
[8]	validation_0-mae:168978.54688
[9]	validation_0-mae:167659.20312
[10]	validation_0-mae:166351.82812
[11]	validation_0-mae:165047.50000
[12]	validation_0-mae:163756.23438
[13]	validation_0-mae:162473.46875
[14]	validation_0-mae:161203.71875
[15]	validation_0-mae:159950.59375
[16]	validation_0-mae:158693.71875
[17]	validation_0-mae:157465.00000
[18]	validation_0-mae:156232.71875
[19]	validation_0-mae:155030.00000
[20]	validation_0-mae:153825.25000
[21]	validation_0-mae:152631.79688
[22]	validation_0-mae:151461.87500
[23]	validation_0-mae:150273.95312
[24]	validation_0-mae:149119.28125
[25]	validation_0-mae:147952.15625
[26]	validation_0-mae:146805.60938
[27]	validation_0-mae:145668.35938
[28]	validation_0-mae:144526.1

[236]	validation_0-mae:31856.14062
[237]	validation_0-mae:31667.38086
[238]	validation_0-mae:31484.68555
[239]	validation_0-mae:31302.95898
[240]	validation_0-mae:31119.53125
[241]	validation_0-mae:30938.17383
[242]	validation_0-mae:30764.28711
[243]	validation_0-mae:30588.39062
[244]	validation_0-mae:30414.52148
[245]	validation_0-mae:30236.50000
[246]	validation_0-mae:30062.75977
[247]	validation_0-mae:29894.60938
[248]	validation_0-mae:29729.55469
[249]	validation_0-mae:29568.60352
[250]	validation_0-mae:29411.75977
[251]	validation_0-mae:29248.83594
[252]	validation_0-mae:29094.06250
[253]	validation_0-mae:28940.66406
[254]	validation_0-mae:28783.81836
[255]	validation_0-mae:28634.07031
[256]	validation_0-mae:28484.72266
[257]	validation_0-mae:28336.04883
[258]	validation_0-mae:28184.84180
[259]	validation_0-mae:28028.94922
[260]	validation_0-mae:27877.37305
[261]	validation_0-mae:27723.68555
[262]	validation_0-mae:27574.99805
[263]	validation_0-mae:27431.46094
[264]	validation_0-m

[471]	validation_0-mae:17464.22266
[472]	validation_0-mae:17458.87500
[473]	validation_0-mae:17450.48047
[474]	validation_0-mae:17446.08789
[475]	validation_0-mae:17439.00586
[476]	validation_0-mae:17435.24219
[477]	validation_0-mae:17429.87305
[478]	validation_0-mae:17423.51367
[479]	validation_0-mae:17417.77930
[480]	validation_0-mae:17414.28320
[481]	validation_0-mae:17408.88281
[482]	validation_0-mae:17401.85547
[483]	validation_0-mae:17396.49023
[484]	validation_0-mae:17390.96484
[485]	validation_0-mae:17384.91797
[486]	validation_0-mae:17377.56445
[487]	validation_0-mae:17372.65430
[488]	validation_0-mae:17367.27148
[489]	validation_0-mae:17362.25977
[490]	validation_0-mae:17356.62305
[491]	validation_0-mae:17350.25977
[492]	validation_0-mae:17343.77734
[493]	validation_0-mae:17336.70508
[494]	validation_0-mae:17331.61719
[495]	validation_0-mae:17324.42383
[496]	validation_0-mae:17318.32227
[497]	validation_0-mae:17312.21875
[498]	validation_0-mae:17307.83398
[499]	validation_0-m

[706]	validation_0-mae:16874.14062
[707]	validation_0-mae:16874.39453
[708]	validation_0-mae:16874.83984
[709]	validation_0-mae:16873.23633
[710]	validation_0-mae:16873.40039
[711]	validation_0-mae:16872.50195
[712]	validation_0-mae:16871.99219
[713]	validation_0-mae:16872.58008
[714]	validation_0-mae:16872.69727
[715]	validation_0-mae:16871.14062
[716]	validation_0-mae:16870.47852
[717]	validation_0-mae:16871.85938
[718]	validation_0-mae:16871.30273
[719]	validation_0-mae:16872.67383
[720]	validation_0-mae:16872.79102
[721]	validation_0-mae:16871.57227
[722]	validation_0-mae:16871.33984
[723]	validation_0-mae:16871.81836
[724]	validation_0-mae:16871.27148
[0]	validation_0-mae:179924.07812
[1]	validation_0-mae:178489.15625
[2]	validation_0-mae:177065.82812
[3]	validation_0-mae:175652.45312
[4]	validation_0-mae:174249.04688
[5]	validation_0-mae:172854.95312
[6]	validation_0-mae:171486.45312
[7]	validation_0-mae:170114.54688
[8]	validation_0-mae:168764.96875
[9]	validation_0-mae:167409.5

[217]	validation_0-mae:35197.73438
[218]	validation_0-mae:34987.20703
[219]	validation_0-mae:34777.21094
[220]	validation_0-mae:34569.64844
[221]	validation_0-mae:34364.33594
[222]	validation_0-mae:34155.90234
[223]	validation_0-mae:33949.62500
[224]	validation_0-mae:33750.29297
[225]	validation_0-mae:33544.12891
[226]	validation_0-mae:33348.00000
[227]	validation_0-mae:33146.52734
[228]	validation_0-mae:32959.45312
[229]	validation_0-mae:32761.19531
[230]	validation_0-mae:32561.29492
[231]	validation_0-mae:32370.06836
[232]	validation_0-mae:32177.93750
[233]	validation_0-mae:31986.78516
[234]	validation_0-mae:31795.87695
[235]	validation_0-mae:31609.75977
[236]	validation_0-mae:31422.02148
[237]	validation_0-mae:31237.54102
[238]	validation_0-mae:31055.05859
[239]	validation_0-mae:30878.23242
[240]	validation_0-mae:30699.62695
[241]	validation_0-mae:30522.33594
[242]	validation_0-mae:30350.57812
[243]	validation_0-mae:30180.17773
[244]	validation_0-mae:30007.55469
[245]	validation_0-m

[452]	validation_0-mae:18855.07422
[453]	validation_0-mae:18849.34180
[454]	validation_0-mae:18840.95508
[455]	validation_0-mae:18832.81641
[456]	validation_0-mae:18825.68359
[457]	validation_0-mae:18820.28516
[458]	validation_0-mae:18811.83789
[459]	validation_0-mae:18804.97461
[460]	validation_0-mae:18798.20117
[461]	validation_0-mae:18790.92188
[462]	validation_0-mae:18784.52344
[463]	validation_0-mae:18778.74219
[464]	validation_0-mae:18772.19531
[465]	validation_0-mae:18766.53711
[466]	validation_0-mae:18760.08203
[467]	validation_0-mae:18754.37695
[468]	validation_0-mae:18749.45508
[469]	validation_0-mae:18744.00000
[470]	validation_0-mae:18740.06445
[471]	validation_0-mae:18734.45312
[472]	validation_0-mae:18729.61523
[473]	validation_0-mae:18725.11133
[474]	validation_0-mae:18718.82812
[475]	validation_0-mae:18715.10352
[476]	validation_0-mae:18710.94727
[477]	validation_0-mae:18704.76758
[478]	validation_0-mae:18700.39648
[479]	validation_0-mae:18694.41602
[480]	validation_0-m

[687]	validation_0-mae:18282.05859
[688]	validation_0-mae:18281.30273
[689]	validation_0-mae:18280.90625
[690]	validation_0-mae:18280.45898
[691]	validation_0-mae:18279.71484
[692]	validation_0-mae:18278.95898
[693]	validation_0-mae:18278.68945
[694]	validation_0-mae:18276.99219
[695]	validation_0-mae:18276.49219
[696]	validation_0-mae:18276.13086
[697]	validation_0-mae:18274.45898
[698]	validation_0-mae:18274.37500
[699]	validation_0-mae:18274.40820
[700]	validation_0-mae:18272.75977
[701]	validation_0-mae:18270.89648
[702]	validation_0-mae:18271.92383
[703]	validation_0-mae:18271.46289
[704]	validation_0-mae:18269.83594
[705]	validation_0-mae:18269.13477
[706]	validation_0-mae:18267.38477
[707]	validation_0-mae:18267.08984
[708]	validation_0-mae:18266.86719
[709]	validation_0-mae:18268.00195
[710]	validation_0-mae:18267.52148
[711]	validation_0-mae:18266.34570
[712]	validation_0-mae:18265.02734
[713]	validation_0-mae:18264.76758
[714]	validation_0-mae:18263.24414
[715]	validation_0-m

[133]	validation_0-mae:64502.08203
[134]	validation_0-mae:64015.26172
[135]	validation_0-mae:63534.41016
[136]	validation_0-mae:63056.54688
[137]	validation_0-mae:62581.14453
[138]	validation_0-mae:62113.28125
[139]	validation_0-mae:61647.92578
[140]	validation_0-mae:61187.81641
[141]	validation_0-mae:60737.61719
[142]	validation_0-mae:60284.32031
[143]	validation_0-mae:59830.56250
[144]	validation_0-mae:59388.31641
[145]	validation_0-mae:58948.67188
[146]	validation_0-mae:58517.02734
[147]	validation_0-mae:58085.78906
[148]	validation_0-mae:57655.62891
[149]	validation_0-mae:57238.46094
[150]	validation_0-mae:56818.74609
[151]	validation_0-mae:56410.67188
[152]	validation_0-mae:55994.61328
[153]	validation_0-mae:55582.76172
[154]	validation_0-mae:55180.12500
[155]	validation_0-mae:54780.28906
[156]	validation_0-mae:54381.23828
[157]	validation_0-mae:53993.07031
[158]	validation_0-mae:53608.54297
[159]	validation_0-mae:53218.65234
[160]	validation_0-mae:52841.67969
[161]	validation_0-m

[368]	validation_0-mae:19664.95508
[369]	validation_0-mae:19623.76172
[370]	validation_0-mae:19593.74805
[371]	validation_0-mae:19555.20898
[372]	validation_0-mae:19523.08008
[373]	validation_0-mae:19484.72852
[374]	validation_0-mae:19448.65625
[375]	validation_0-mae:19410.78906
[376]	validation_0-mae:19374.66016
[377]	validation_0-mae:19348.00586
[378]	validation_0-mae:19311.62305
[379]	validation_0-mae:19276.09375
[380]	validation_0-mae:19240.96484
[381]	validation_0-mae:19215.09375
[382]	validation_0-mae:19183.03711
[383]	validation_0-mae:19153.31836
[384]	validation_0-mae:19119.91211
[385]	validation_0-mae:19093.93750
[386]	validation_0-mae:19065.20117
[387]	validation_0-mae:19038.37500
[388]	validation_0-mae:19008.76562
[389]	validation_0-mae:18975.58203
[390]	validation_0-mae:18948.21875
[391]	validation_0-mae:18917.10938
[392]	validation_0-mae:18885.29102
[393]	validation_0-mae:18855.78320
[394]	validation_0-mae:18832.08984
[395]	validation_0-mae:18803.81445
[396]	validation_0-m

[603]	validation_0-mae:17098.78711
[604]	validation_0-mae:17098.58008
[605]	validation_0-mae:17094.55273
[606]	validation_0-mae:17094.55469
[607]	validation_0-mae:17093.63086
[608]	validation_0-mae:17089.64648
[609]	validation_0-mae:17088.82227
[610]	validation_0-mae:17088.63477
[611]	validation_0-mae:17088.75391
[612]	validation_0-mae:17087.22656
[613]	validation_0-mae:17086.45508
[614]	validation_0-mae:17085.87109
[615]	validation_0-mae:17083.69141
[616]	validation_0-mae:17079.80469
[617]	validation_0-mae:17079.67383
[618]	validation_0-mae:17075.76172
[619]	validation_0-mae:17075.40625
[620]	validation_0-mae:17074.29102
[621]	validation_0-mae:17073.56836
[622]	validation_0-mae:17073.57617
[623]	validation_0-mae:17073.20703
[624]	validation_0-mae:17072.93750
[625]	validation_0-mae:17073.64062
[626]	validation_0-mae:17069.82227
[627]	validation_0-mae:17069.04688
[628]	validation_0-mae:17065.59961
[629]	validation_0-mae:17065.89062
[630]	validation_0-mae:17062.84570
[631]	validation_0-m

[838]	validation_0-mae:16859.66602
[839]	validation_0-mae:16858.70898
[840]	validation_0-mae:16857.79492
[841]	validation_0-mae:16857.03125
[842]	validation_0-mae:16856.64258
[843]	validation_0-mae:16855.23047
[844]	validation_0-mae:16854.97852
[845]	validation_0-mae:16853.63477
[846]	validation_0-mae:16853.39453
[847]	validation_0-mae:16852.89648
[848]	validation_0-mae:16852.08984
[849]	validation_0-mae:16851.22656
[850]	validation_0-mae:16850.22461
[851]	validation_0-mae:16849.90820
[852]	validation_0-mae:16850.26367
[853]	validation_0-mae:16849.08203
[854]	validation_0-mae:16849.30469
[855]	validation_0-mae:16848.89648
[856]	validation_0-mae:16848.14453
[857]	validation_0-mae:16847.73828
[858]	validation_0-mae:16846.74219
[859]	validation_0-mae:16846.33984
[860]	validation_0-mae:16845.75586
[861]	validation_0-mae:16845.64453
[862]	validation_0-mae:16845.27344
[863]	validation_0-mae:16845.71289
[864]	validation_0-mae:16846.31055
[865]	validation_0-mae:16846.84766
[866]	validation_0-m

[93]	validation_0-mae:88165.09375
[94]	validation_0-mae:87508.79688
[95]	validation_0-mae:86844.19531
[96]	validation_0-mae:86198.91406
[97]	validation_0-mae:85575.57031
[98]	validation_0-mae:84944.54688
[99]	validation_0-mae:84302.63281
[100]	validation_0-mae:83683.01562
[101]	validation_0-mae:83060.08594
[102]	validation_0-mae:82434.85156
[103]	validation_0-mae:81825.41406
[104]	validation_0-mae:81221.18750
[105]	validation_0-mae:80627.33594
[106]	validation_0-mae:80034.18750
[107]	validation_0-mae:79438.79688
[108]	validation_0-mae:78854.03906
[109]	validation_0-mae:78261.28906
[110]	validation_0-mae:77665.53125
[111]	validation_0-mae:77101.80469
[112]	validation_0-mae:76538.35156
[113]	validation_0-mae:75983.08594
[114]	validation_0-mae:75411.89844
[115]	validation_0-mae:74845.25781
[116]	validation_0-mae:74298.40625
[117]	validation_0-mae:73755.28906
[118]	validation_0-mae:73197.27344
[119]	validation_0-mae:72652.96094
[120]	validation_0-mae:72126.75000
[121]	validation_0-mae:7159

[328]	validation_0-mae:22198.63867
[329]	validation_0-mae:22137.52344
[330]	validation_0-mae:22078.16016
[331]	validation_0-mae:22026.90234
[332]	validation_0-mae:21972.04883
[333]	validation_0-mae:21920.67188
[334]	validation_0-mae:21868.26367
[335]	validation_0-mae:21814.22656
[336]	validation_0-mae:21762.56836
[337]	validation_0-mae:21710.52930
[338]	validation_0-mae:21660.19531
[339]	validation_0-mae:21607.07812
[340]	validation_0-mae:21558.35352
[341]	validation_0-mae:21511.44922
[342]	validation_0-mae:21463.45508
[343]	validation_0-mae:21418.48438
[344]	validation_0-mae:21368.10938
[345]	validation_0-mae:21324.44922
[346]	validation_0-mae:21280.07812
[347]	validation_0-mae:21241.04688
[348]	validation_0-mae:21197.86328
[349]	validation_0-mae:21154.72852
[350]	validation_0-mae:21111.05273
[351]	validation_0-mae:21067.71289
[352]	validation_0-mae:21024.99219
[353]	validation_0-mae:20984.18750
[354]	validation_0-mae:20943.98633
[355]	validation_0-mae:20903.23242
[356]	validation_0-m

[563]	validation_0-mae:17963.30273
[564]	validation_0-mae:17958.75391
[565]	validation_0-mae:17954.23242
[566]	validation_0-mae:17950.23438
[567]	validation_0-mae:17948.34570
[568]	validation_0-mae:17943.79102
[569]	validation_0-mae:17938.23438
[570]	validation_0-mae:17932.49219
[571]	validation_0-mae:17930.14453
[572]	validation_0-mae:17926.30469
[573]	validation_0-mae:17919.57227
[574]	validation_0-mae:17920.47852
[575]	validation_0-mae:17917.29688
[576]	validation_0-mae:17914.89258
[577]	validation_0-mae:17912.41406
[578]	validation_0-mae:17907.17383
[579]	validation_0-mae:17908.20312
[580]	validation_0-mae:17906.15820
[581]	validation_0-mae:17903.81836
[582]	validation_0-mae:17900.35547
[583]	validation_0-mae:17898.04102
[584]	validation_0-mae:17899.08984
[585]	validation_0-mae:17896.06445
[586]	validation_0-mae:17892.43945
[587]	validation_0-mae:17888.77344
[588]	validation_0-mae:17886.92773
[589]	validation_0-mae:17887.99219
[590]	validation_0-mae:17882.60547
[591]	validation_0-m

[798]	validation_0-mae:17606.31445
[799]	validation_0-mae:17605.32812
[800]	validation_0-mae:17605.22461
[801]	validation_0-mae:17603.68750
[802]	validation_0-mae:17603.26758
[803]	validation_0-mae:17603.62500
[804]	validation_0-mae:17602.09961
[805]	validation_0-mae:17602.60156
[806]	validation_0-mae:17601.83594
[807]	validation_0-mae:17601.75977
[808]	validation_0-mae:17601.28516
[809]	validation_0-mae:17601.81445
[810]	validation_0-mae:17601.13281
[811]	validation_0-mae:17601.80664
[812]	validation_0-mae:17602.35742
[813]	validation_0-mae:17601.93945
[814]	validation_0-mae:17602.07617
[815]	validation_0-mae:17600.82031
[816]	validation_0-mae:17601.29102
[817]	validation_0-mae:17600.55078
[818]	validation_0-mae:17600.14453
[819]	validation_0-mae:17600.53516
[820]	validation_0-mae:17601.00391
[821]	validation_0-mae:17599.97852
[822]	validation_0-mae:17600.35352
[823]	validation_0-mae:17600.07031
[824]	validation_0-mae:17600.46875
[825]	validation_0-mae:17599.17773
[826]	validation_0-m

[1032]	validation_0-mae:17438.90430
[1033]	validation_0-mae:17438.70508
[1034]	validation_0-mae:17437.88477
[1035]	validation_0-mae:17437.42773
[1036]	validation_0-mae:17437.46680
[1037]	validation_0-mae:17436.79883
[1038]	validation_0-mae:17436.58984
[1039]	validation_0-mae:17434.82422
[1040]	validation_0-mae:17434.35352
[1041]	validation_0-mae:17433.19727
[1042]	validation_0-mae:17432.74219
[1043]	validation_0-mae:17432.67969
[1044]	validation_0-mae:17431.57617
[1045]	validation_0-mae:17430.92188
[1046]	validation_0-mae:17430.03125
[1047]	validation_0-mae:17428.85352
[1048]	validation_0-mae:17428.83984
[1049]	validation_0-mae:17427.84961
[1050]	validation_0-mae:17426.00391
[1051]	validation_0-mae:17425.56250
[1052]	validation_0-mae:17424.86914
[1053]	validation_0-mae:17423.83594
[1054]	validation_0-mae:17423.16016
[1055]	validation_0-mae:17422.06836
[1056]	validation_0-mae:17420.84961
[1057]	validation_0-mae:17421.02539
[1058]	validation_0-mae:17420.80078
[1059]	validation_0-mae:1742

[54]	validation_0-mae:117293.45312
[55]	validation_0-mae:116365.90625
[56]	validation_0-mae:115444.64062
[57]	validation_0-mae:114522.25000
[58]	validation_0-mae:113610.63281
[59]	validation_0-mae:112700.85156
[60]	validation_0-mae:111817.23438
[61]	validation_0-mae:110935.36719
[62]	validation_0-mae:110080.45312
[63]	validation_0-mae:109218.45312
[64]	validation_0-mae:108370.25781
[65]	validation_0-mae:107524.88281
[66]	validation_0-mae:106692.07031
[67]	validation_0-mae:105866.57812
[68]	validation_0-mae:105047.27344
[69]	validation_0-mae:104239.82812
[70]	validation_0-mae:103423.35938
[71]	validation_0-mae:102617.57031
[72]	validation_0-mae:101813.08594
[73]	validation_0-mae:101047.09375
[74]	validation_0-mae:100259.78125
[75]	validation_0-mae:99477.46875
[76]	validation_0-mae:98704.98438
[77]	validation_0-mae:97958.82031
[78]	validation_0-mae:97215.98438
[79]	validation_0-mae:96475.69531
[80]	validation_0-mae:95731.71875
[81]	validation_0-mae:94979.14062
[82]	validation_0-mae:94264

[289]	validation_0-mae:24285.22656
[290]	validation_0-mae:24194.51367
[291]	validation_0-mae:24100.78516
[292]	validation_0-mae:24016.76758
[293]	validation_0-mae:23928.38086
[294]	validation_0-mae:23848.60547
[295]	validation_0-mae:23771.92383
[296]	validation_0-mae:23693.60352
[297]	validation_0-mae:23617.10938
[298]	validation_0-mae:23537.39453
[299]	validation_0-mae:23462.73633
[300]	validation_0-mae:23384.53516
[301]	validation_0-mae:23310.32617
[302]	validation_0-mae:23233.71289
[303]	validation_0-mae:23156.99219
[304]	validation_0-mae:23083.89648
[305]	validation_0-mae:23010.86719
[306]	validation_0-mae:22937.45898
[307]	validation_0-mae:22866.98633
[308]	validation_0-mae:22796.92188
[309]	validation_0-mae:22723.33594
[310]	validation_0-mae:22653.87109
[311]	validation_0-mae:22589.52734
[312]	validation_0-mae:22525.17188
[313]	validation_0-mae:22460.86914
[314]	validation_0-mae:22396.00977
[315]	validation_0-mae:22332.18750
[316]	validation_0-mae:22265.79883
[317]	validation_0-m

[524]	validation_0-mae:17788.02344
[525]	validation_0-mae:17783.15820
[526]	validation_0-mae:17777.79492
[527]	validation_0-mae:17772.60938
[528]	validation_0-mae:17768.62305
[529]	validation_0-mae:17764.71484
[530]	validation_0-mae:17760.84570
[531]	validation_0-mae:17757.11328
[532]	validation_0-mae:17752.39062
[533]	validation_0-mae:17748.54102
[534]	validation_0-mae:17745.64648
[535]	validation_0-mae:17741.05469
[536]	validation_0-mae:17737.60938
[537]	validation_0-mae:17733.11719
[538]	validation_0-mae:17730.99609
[539]	validation_0-mae:17729.45312
[540]	validation_0-mae:17725.97852
[541]	validation_0-mae:17721.13281
[542]	validation_0-mae:17719.21875
[543]	validation_0-mae:17716.02930
[544]	validation_0-mae:17711.83203
[545]	validation_0-mae:17709.98828
[546]	validation_0-mae:17705.73242
[547]	validation_0-mae:17701.75000
[548]	validation_0-mae:17697.05078
[549]	validation_0-mae:17693.98047
[550]	validation_0-mae:17690.45508
[551]	validation_0-mae:17687.31250
[552]	validation_0-m

[759]	validation_0-mae:17393.12305
[760]	validation_0-mae:17391.66992
[761]	validation_0-mae:17389.65430
[762]	validation_0-mae:17390.17773
[763]	validation_0-mae:17387.95117
[764]	validation_0-mae:17386.93945
[765]	validation_0-mae:17384.49609
[766]	validation_0-mae:17383.34961
[767]	validation_0-mae:17382.99219
[768]	validation_0-mae:17381.29688
[769]	validation_0-mae:17379.83594
[770]	validation_0-mae:17379.48438
[771]	validation_0-mae:17378.19922
[772]	validation_0-mae:17377.94336
[773]	validation_0-mae:17378.30859
[774]	validation_0-mae:17376.86719
[775]	validation_0-mae:17377.03516
[776]	validation_0-mae:17376.57227
[777]	validation_0-mae:17374.90430
[778]	validation_0-mae:17373.14648
[779]	validation_0-mae:17373.00586
[780]	validation_0-mae:17373.54102
[781]	validation_0-mae:17373.29688
[782]	validation_0-mae:17372.04883
[783]	validation_0-mae:17372.55469
[784]	validation_0-mae:17371.39648
[785]	validation_0-mae:17369.99805
[786]	validation_0-mae:17368.78516
[787]	validation_0-m

# XGBoost with Early Stopping Params
* Test 2 combinations: 
1. random_state=0, n_estimators=1500, learning_rate=0.01 and early_stopping_rounds set to 20 (may be overfitting)
2. random_state=0, n_estimators=175, n_estimators=300, n_estimators=500, learning_rate=0.05 (may be underfitting)


In [171]:
# Final XGBoost candidate: 900 boosting rounds, learning rate 0.01, fixed seed.
xbg_model_1 = xgb.XGBRegressor(
    random_state=0,
    learning_rate=0.01,
    n_estimators=900,
)

# Bundle preprocessing and the regressor so they are fitted and applied as one unit.
xgb_pipeline_1 = Pipeline([
    ('preprocess', preprocessor_1),
    ('model', xbg_model_1),
])

# Train on X_clean / y (no hold-out is split off in this cell).
xgb_pipeline_1.fit(X_clean, y)

# Predictions for X_test_clean, consumed by the submission cell.
xgb_pred_1 = xgb_pipeline_1.predict(X_test_clean)

In [None]:
# Build the submission table: the index of X_test_clean as 'Id' next to the
# predictions in xgb_pred_1, then write it out without the DataFrame's own index.
output = pd.DataFrame(
    {
        'Id': X_test_clean.index,
        'SalePrice': xgb_pred_1,
    }
)
output.to_csv(path_or_buf='xgb_submission.csv', index=False)

In [None]:
# NOTE(review): every line in this cell is commented out, so nothing here runs.
# Disabled experiment: seven XGBRegressor models that differ only in n_estimators
# (10, 25, 35, 55, 75, 20, 15 in list order), each passed to test_model together
# with preprocessor_1; the returned values are collected in xgb_accuracy_2.
# test_model is defined outside this cell; presumably it returns an error
# score per model — TODO confirm before re-enabling.
# more xgb models with a lower n estimators
# xgb_model_6 = xgb.XGBRegressor(n_estimators=10, random_state=0)
# xgb_model_7 = xgb.XGBRegressor(n_estimators=25, random_state=0)
# xgb_model_8 = xgb.XGBRegressor(n_estimators=35, random_state=0)
# xgb_model_9 = xgb.XGBRegressor(n_estimators=55, random_state=0)
# xgb_model_10 = xgb.XGBRegressor(n_estimators=75, random_state=0)
# xgb_model_11 = xgb.XGBRegressor(n_estimators=20, random_state=0)
# xgb_model_12 = xgb.XGBRegressor(n_estimators=15, random_state=0)

# xgb_list_2 = [xgb_model_6, xgb_model_7, xgb_model_8, xgb_model_9, xgb_model_10, xgb_model_11, xgb_model_12]
# xgb_accuracy_2 = []
# for model in xgb_list_2:
#     xgb_accuracy_2.append(test_model(model, preprocessor_1))

In [None]:
# NOTE(review): disabled plotting cell; it only works if the commented-out sweep
# that fills xgb_accuracy_2 is re-enabled and run first.
# num_estimator_vals is a hand-written copy of the n_estimators used for
# xgb_model_6..xgb_model_12, in the same order as xgb_list_2, so the two lists
# must be kept in sync by hand.
# num_estimator_vals = [10, 25, 35, 55, 75, 20, 15]
# ax_7 = sns.lineplot(x=num_estimator_vals, y=xgb_accuracy_2)
# ax_7.set_title('XGB Accuracy vs Number of Estimator Values')
# ax_7.set_xlabel('Number of Estimator Values')
# ax_7.set_ylabel('House Price Prediction Error ($)')

In [None]:
# NOTE(review): every line in this cell is commented out, so nothing here runs.
# Disabled experiment: five XGBRegressor models with n_estimators fixed at 20 and
# learning_rate set to 0.2, 0.1, 0.05, 0.01 and 0.008 (in list order), each passed
# to test_model with preprocessor_1; results are collected in xgb_accuracy_4.
# test_model is defined outside this cell — its return value is presumably an
# error score; TODO confirm.
# change the learning rate to increase model accuracy
# xgb_model_13 = xgb.XGBRegressor(n_estimators=20, learning_rate=0.2, random_state=0)
# xgb_model_14 = xgb.XGBRegressor(n_estimators=20, learning_rate=0.1, random_state=0)
# xgb_model_15 = xgb.XGBRegressor(n_estimators=20, learning_rate=0.05, random_state=0)
# xgb_model_16 = xgb.XGBRegressor(n_estimators=20, learning_rate=0.01, random_state=0)
# xgb_model_17 = xgb.XGBRegressor(n_estimators=20, learning_rate=0.008, random_state=0)

# xgb_list_4 = [xgb_model_13, xgb_model_14, xgb_model_15, xgb_model_16, xgb_model_17]
# xgb_accuracy_4 = []
# for model in xgb_list_4:
#     xgb_accuracy_4.append(test_model(model, preprocessor_1))

In [None]:
# NOTE(review): disabled plotting cell; it needs xgb_accuracy_4 from the
# commented-out learning-rate sweep.
# NOTE(review): the title string below is the same as the n_estimators plot's
# title, although the x-axis here is the learning rate — looks like a copy-paste
# slip; change it to describe learning rate if this cell is re-enabled.
# learn_rate_vals = [0.2, 0.1, 0.05, 0.01, 0.008]
# ax_8 = sns.lineplot(x=learn_rate_vals, y=xgb_accuracy_4)
# ax_8.set_title('XGB Accuracy vs Number of Estimator Values')
# ax_8.set_xlabel('Learning Rate')
# ax_8.set_ylabel('House Price Prediction Error ($)')

In [None]:
# NOTE(review): disabled single-model check (n_estimators=35, learning_rate=0.05).
# It reuses the name xgb_model_13, which the commented-out learning-rate sweep
# also assigns with different parameters; pick a distinct name if both cells
# are ever re-enabled.
# change the learning rate to increase model accuracy again
# xgb_model_13 = xgb.XGBRegressor(n_estimators=35, learning_rate=0.05, random_state=0)
# print(test_model(xgb_model_13, preprocessor_1))