# **Reduce Complexity & Training Time By Not Using Object Features (28 Features Remaining) For House Predictions**
---
**1. ANALYZING AND CLEANING DATA**

In [None]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
import pandas as pd
import numpy as np
import copy
import seaborn as sns
import matplotlib.pyplot as plt


from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
%matplotlib inline

In [None]:
# Load the Kaggle training split and report its shape.
data = pd.read_csv(
    '/share/dutta/eyao/dataset/kaggle/house-prices-advanced-regression-techniques/train.csv'
)
print(f"Table Shape: {data.shape}")

In [None]:
data.head(7) # Preview the first 7 rows of the dataset

* **Missing data** appears in many columns, such as Alley, PoolQC, Fence, MiscFeature, ...

In [None]:
# Count the missing values per column and list only the columns that have any.
null_class = data.isna().sum()
print(null_class.loc[null_class.ne(0)])

In [None]:
# Summary statistics of the dataset (displayed as the cell output).
data.describe() 

Evaluate each feature and decide how to handle it, based on the tables above:

1. Alley, PoolQC, Fence, MiscFeature:
> * Because the number of missing values is very large, these features would adversely affect the regression model. 
> * Solution: I will remove these features (delete NaN columns).
2. MasVnrType, MasVnrArea, Electrical:
> * Because the number of missing values is very small => It doesn't affect the model much. 
> * Solution: I will remove these samples (delete NaN rows).
3. BsmtQual, BsmtCond, BsmtExposure,BsmtFinType1, BsmtFinType2, GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond:
> * These are basement and garage data and they represent only 2.5% to 5.5% of the total sample.
> * Because these features are chosen according to human behavior, I will replace NaN with the most common value for each feature.
4. LotFrontage, FireplaceQu:
> * The share of missing LotFrontage values (17.7%) is fairly large, but its 25% and 75% statistics do not differ much. Missing FireplaceQu values account for almost 50% of all samples.
> * Solution: LotFrontage - I'll replace these values with the mean. FireplaceQu - I will remove this feature.
5. Object Features.
> * Solution: convert the object features into a one-hot encoding (or drop them entirely, as `preprocess_no_obj_feature` does).

In [None]:
def preprocess(df, train_data=True):
    """Clean a house-prices frame while keeping its object (categorical) columns.

    The frame is modified in place and is also returned.

    Steps:
      1. Training data only: drop rows with a missing MasVnrType, MasVnrArea
         or Electrical value.
      2. Fill the basement/garage columns with their most frequent value.
      3. Fill LotFrontage with its mean.
      4. Remove the mostly-empty columns and the Id column.
      5. Test data only: fill every column that still contains NaN with its
         most frequent value, because test rows must not be dropped.

    Args:
        df: DataFrame with the columns of the Kaggle house-prices csv files.
        train_data: True for the training split, False for the test split.

    Returns:
        The same DataFrame object, cleaned.

    Raises:
        KeyError: if one of the expected columns is absent.
    """
    drop_rows_list = ["MasVnrType", "MasVnrArea", "Electrical"]
    if train_data:
        # Drop rows that have NaN in any of the listed columns.
        df.dropna(subset=drop_rows_list, inplace=True)

    # Replace NaN with the most common value.
    most_common_list = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                        "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]
    for column in most_common_list:
        modes = df[column].mode()
        if not modes.empty:  # an all-NaN column has no mode; leave it untouched
            # Assign back instead of a chained in-place fillna, which does not
            # reliably write through to the frame.
            df[column] = df[column].fillna(modes.iloc[0])

    # Replace NaN with the mean.
    df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].mean())

    # Remove some unnecessary columns.
    removed_features_list = ["Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu", "Id"]
    df.drop(columns=removed_features_list, inplace=True)

    if not train_data:
        # Use the frame that was passed in (not a notebook global) to find
        # the columns that still have gaps.
        for column in df.columns[df.isnull().any()]:
            modes = df[column].mode()
            if not modes.empty:
                df[column] = df[column].fillna(modes.iloc[0])

    return df

def preprocess_no_obj_feature(df, train_data=True):
    """Clean a house-prices frame and keep only its non-object columns.

    The frame is modified in place and is also returned.

    Steps:
      1. Training data only: drop rows with a missing MasVnrType, MasVnrArea
         or Electrical value.
      2. Remove the mostly-empty columns, the Id column and every remaining
         object-dtype column.
      3. Fill the surviving basement/garage columns (those that are not
         object-dtype) with their most frequent value.
      4. Fill LotFrontage with its mean.
      5. Test data only: fill every column that still contains NaN with its
         most frequent value, because test rows must not be dropped.

    Object columns are not imputed: they are deleted anyway, so filling them
    first would not change the result.

    Args:
        df: DataFrame with the columns of the Kaggle house-prices csv files.
        train_data: True for the training split, False for the test split.

    Returns:
        The same DataFrame object, cleaned.

    Raises:
        KeyError: if one of the expected columns is absent.
    """
    drop_rows_list = ["MasVnrType", "MasVnrArea", "Electrical"]
    if train_data:
        # Drop rows that have NaN in any of the listed columns.
        df.dropna(subset=drop_rows_list, inplace=True)

    # Remove some unnecessary columns.
    removed_features_list = ["Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu", "Id"]
    df.drop(columns=removed_features_list, inplace=True)

    # Delete all remaining object features. The columns are taken from the
    # frame that was passed in, not from a notebook global.
    df.drop(columns=df.select_dtypes(include=['object']).columns, inplace=True)

    # Replace NaN with the most common value in the basement/garage columns
    # that are still present after the object columns were removed.
    most_common_list = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                        "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]
    for column in most_common_list:
        if column not in df.columns:
            continue
        modes = df[column].mode()
        if not modes.empty:  # an all-NaN column has no mode; leave it untouched
            # Assign back instead of a chained in-place fillna, which does not
            # reliably write through to the frame.
            df[column] = df[column].fillna(modes.iloc[0])

    # Replace NaN with the mean.
    df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].mean())

    if not train_data:
        # Replace NaN with the most common value in every column that still
        # has gaps.
        for column in df.columns[df.isnull().any()]:
            modes = df[column].mode()
            if not modes.empty:
                df[column] = df[column].fillna(modes.iloc[0])

    return df

# Pick exactly one of the two preprocessing variants (comment out the other).
# data = preprocess(data, train_data=True) # Variant that keeps the object features
data = preprocess_no_obj_feature(data, train_data=True) # Variant that discards the object features

In [None]:
# Preview the first 3 rows after preprocessing.
data.head(3) 

---
**2. EXAMINING AND EXPLORING DATA**

In [None]:
# Pairwise correlation between all columns; the last line displays the table.
correlation_matrix = data.corr()
correlation_matrix

In [None]:
upper_bound_threshold, lower_bound_threshold = 0.8, -0.3

# Only feature-vs-feature correlations decide which columns are removed.
# The target must never end up in the drop sets: the following cells still
# need SalePrice in the training frame, and the test set has no such column.
feature_corr = correlation_matrix.drop(index="SalePrice", columns="SalePrice", errors="ignore")
corr_values = feature_corr.to_numpy()
# Ignore the diagonal: every feature correlates perfectly with itself.
off_diagonal = ~np.eye(len(feature_corr), dtype=bool)

# Find feature pairs whose correlation is above / below the thresholds.
high_mask = (corr_values > upper_bound_threshold) & off_diagonal
neg_mask = (corr_values < lower_bound_threshold) & off_diagonal

# Collect every feature that takes part in at least one such pair.
high_corr_features = set(feature_corr.columns[high_mask.any(axis=0) | high_mask.any(axis=1)])
neg_corr_features = set(feature_corr.columns[neg_mask.any(axis=0) | neg_mask.any(axis=1)])

# Remove highly correlated features from the dataset.
data_without_high_corr = data.drop(columns=list(high_corr_features))
# data_without_high_corr = data_without_high_corr.drop(columns=neg_corr_features.difference(high_corr_features))

In [None]:
# # SKIP this cell if do not use object features. Uncomment if use object features
# # Visualize the boxplot to find the outlier for all object features
# feats_for_find_outlier = (data_without_high_corr.dtypes[data_without_high_corr.dtypes == object]).keys().values.reshape(-1, 2)
# num_row, num_col = feats_for_find_outlier.shape[0], feats_for_find_outlier.shape[1]
# fig, ax = plt.subplots(num_row, num_col, figsize=(12, 60)) 
# for row in range(num_row):
#     for col in range(num_col):
#         sns.boxplot(data=data_without_high_corr, x=feats_for_find_outlier[row,col], y='SalePrice', ax = ax[row,col], dodge=False)
# plt.tight_layout() 
# plt.show() 

> Based on the diagrams above, I select sub-category to remove outliers. Because the number of data rows is not much, I will only select 3 columns (features) to remove outlier: RoofStyle (Gable), BsmtCond (TA), SaleCondition (Abnorml).

In [None]:
# # SKIP this cell if do not use object features. Uncomment if use object features
# col_feats = ["RoofStyle", "BsmtCond", "SaleCondition"]
# categorical_of_col = ["Gable","TA","Abnorml"]

# def find_combination(categorical_of_col, num_items_to_select):
#     combinations_of_cat = list(combinations(categorical_of_col, num_items_to_select))
#     return combinations_of_cat

# def find_outlier_threshold(df_in, target_col, in_col, in_category):
#     price_by_cat = df_in[target_col][df_in[in_col]==in_category] 
#     q1 = price_by_cat.quantile(0.25)
#     q3 = price_by_cat.quantile(0.75)
#     iqr = q3-q1
#     f_low  = q1 - 1.5 * iqr
#     f_high = q3 + 1.5 * iqr
#     return f_low, f_high 

# def remove_outlier(df_in, target_col, col_feats, categorical_of_col, fence_low, fence_high):
#     remove_ind = []
#     for ind in df_in.index:
#         if df_in[col_feats][ind] == categorical_of_col:
#             if (df_in[target_col][ind] > fence_low) and (df_in[target_col][ind] < fence_high):
#                 remove_ind.append(ind)
#     for ind in remove_ind:
#         df_in = df_in.drop(ind)
#     return df_in
                
# # Find threshold for each feature
# fence_low, fence_high = [0]*len(categorical_of_col), [0]*len(categorical_of_col)
# for i in range(len(categorical_of_col)):
#     fence_low[i], fence_high[i] = find_outlier_threshold(data_without_high_corr, "SalePrice", col_feats[i], categorical_of_col[i])
# for i in range(len(fence_low)):
#     removed_outlier_data = remove_outlier(data_without_high_corr, "SalePrice", col_feats[i], categorical_of_col[i], fence_low[i], fence_high[i])

In [None]:
# # SKIP this cell if do not use object features.
# # For sure test and train have the same dummies
# test_set = pd.read_csv('/share/dutta/eyao/dataset/kaggle/house-prices-advanced-regression-techniques/test.csv') 
# ids = test_set.Id
# test_set = preprocess(test_set, train_data=False)
# test_set = test_set.drop(columns=high_corr_features)
# test_set = test_set.drop(columns=neg_corr_features.difference(high_corr_features))
# target = removed_outlier_data.SalePrice 
# removed_outlier_data = removed_outlier_data.drop(columns=["SalePrice"])

# # Create dummies and remove object features
# data_type = removed_outlier_data.dtypes
# object_features = data_type[data_type==object]
# non_object_features = data_type[data_type!=object]
# object_data = removed_outlier_data[object_features.keys()]
# # Categorical data
# len_train = len(removed_outlier_data)
# dataset = pd.concat(objs=[removed_outlier_data, test_set], axis=0)
# dataset = pd.get_dummies(dataset)
# test_set = copy.copy(dataset[len_train:])
# source = copy.copy(dataset[:len_train])

In [None]:
# Function to remove outlier without object features
def remove_outlier(df_in, col_name):
    """Return the rows of df_in whose col_name value lies strictly inside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25% and 75% quantiles of the column and IQR = Q3 - Q1.
    """
    values = df_in[col_name]
    lower_quartile, upper_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    fence_low = lower_quartile - 1.5 * spread
    fence_high = upper_quartile + 1.5 * spread
    inside_fences = (values > fence_low) & (values < fence_high)
    return df_in.loc[inside_fences]

# Discard the rows whose sale price is an IQR outlier, then keep the
# remaining prices as the regression target.
removed_outlier_data = remove_outlier(df_in=data_without_high_corr, col_name='SalePrice')
target = removed_outlier_data['SalePrice']

In [None]:
# Load the Kaggle test split and remember its Ids for the submission files.
test_set = pd.read_csv(
    '/share/dutta/eyao/dataset/kaggle/house-prices-advanced-regression-techniques/test.csv'
)
ids = test_set['Id']

# Apply the same cleaning and the same column removal as for the training data.
test_set = preprocess_no_obj_feature(test_set, train_data=False)
test_set = test_set.drop(columns=high_corr_features)
# test_set = test_set.drop(columns=neg_corr_features.difference(high_corr_features))

# The model input is the training frame without its target column.
source = removed_outlier_data.drop(columns=["SalePrice"])
removed_outlier_data = source

---
**3. LINEAR REGRESSION MODEL**

In [None]:
X, y = source, target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Fit a plain linear regression and predict the held-out validation rows.
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_val)

# Actual prices on the x axis, predictions on the y axis.
plt.scatter(y_val, y_pred)
# Identity line: a perfect model would put every point on it.
lims = [min(y_val.min(), y_pred.min()), max(y_val.max(), y_pred.max())]
plt.plot(lims, lims, 'k--', lw=3)
plt.xlabel('y_val')
plt.ylabel('y_predicted')
plt.title('Linear Regression')
plt.show()
print("Train R2 Score: {}".format(LR.score(X_train, y_train)))
print("Test R2 Score: {}".format(LR.score(X_val, y_val)))

In [None]:
# Out-of-fold predictions for every training row (5-fold cross-validation).
y_pred_kFolds = cross_val_predict(LR, X.values, y.values, cv=5)

# Actual prices on the x axis, predictions on the y axis.
plt.scatter(y, y_pred_kFolds)
# Identity line spanning the plotted data: a perfect model would put every
# point on it.
lims = [min(y.min(), y_pred_kFolds.min()), max(y.max(), y_pred_kFolds.max())]
plt.plot(lims, lims, 'k--', lw=3)
plt.xlabel('y_Test')
plt.ylabel('y_Predicted')
plt.title('Linear Regression with K-Folds')
plt.show()

# cv is given explicitly so that it matches the "5-Folds" message below.
cv_r2_scores = cross_val_score(LR, source, target, scoring='r2', cv=5)
print("Mean 5-Folds R Squared: {}".format(np.mean(cv_r2_scores)))

---
**4. PCA**

In [None]:
# Project the feature matrix onto 15 principal components and wrap the
# result in a frame with the columns pca1 ... pca15.
pca = PCA(n_components=15)
pca_fit = pca.fit_transform(source)
pca_df = pd.DataFrame(data=pca_fit,
                      columns=[f'pca{k}' for k in range(1, 16)])

In [None]:
X_pca_train, X_pca_val, y_train_pca, y_val_pca = train_test_split(pca_df, y, test_size=0.2)

# Fit a linear regression on the principal components.
LR_pca = LinearRegression()
LR_pca.fit(X_pca_train, y_train_pca)
y_pred_pca = LR_pca.predict(X_pca_val)

# Actual prices on the x axis, predictions on the y axis.
plt.scatter(y_val_pca, y_pred_pca)
# Identity line: a perfect model would put every point on it.
lims_pca = [min(y_val_pca.min(), y_pred_pca.min()), max(y_val_pca.max(), y_pred_pca.max())]
plt.plot(lims_pca, lims_pca, 'k--', lw=3)
plt.xlabel('y_val')
plt.ylabel('y_predicted')
plt.title('Linear Regression with PCA')
plt.show()
print("Train R2 Score: {}".format(LR_pca.score(X_pca_train, y_train_pca)))
print("Test R2 Score: {}".format(LR_pca.score(X_pca_val, y_val_pca)))

PCA does not actually improve the performance.

---
**5. VISUALIZE RESIDUE AND HOMOSCEDASTICITY**

In [None]:
# Calculate residuals
residuals = y_val - y_pred
# Create a residual plot
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, color='blue')
plt.axhline(y=0, color='red', linestyle='--')  
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.grid(True)
plt.show()

In [None]:
# Histogram of the residuals: residual value on the x axis, number of
# samples per bin on the y axis.
plt.hist(residuals, bins=200)
plt.title('Distribution of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

> Based on the two diagrams above, we find:
> > 1. The mean of the residuals is approximately equal to 0, and their distribution has the same form as the normal distribution.
> > 2. Most data points are distributed around the horizontal axis, but there are a few outlier points.
> > 3. The Linear Regression model is acceptable for predicting SalePrice.

---
**6. TRAINING WITH DIFFERENT METHODS**

In [None]:
# New 80/20 hold-out split for the models below.
X = source
y = target
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [None]:
# Candidate hyper-parameters for gradient boosting.
learning_rate = [0.01, 0.02, 0.05, 0.1]
subsample = [0.5, 0.2, 0.1]
n_estimators = [100, 500, 1000, 1500]
max_depth = [None, 3, 6, 9, 12]
param_grid = dict(learning_rate=learning_rate,
                  subsample=subsample,
                  n_estimators=n_estimators,
                  max_depth=max_depth)

# Exhaustive search over the grid with 2-fold cross-validation.
GBR = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=param_grid,
                   cv=2, n_jobs=-1)
GBR.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", GBR.best_estimator_)
print("\n The best score across ALL searched params:\n", GBR.best_score_)
print("\n The best parameters across ALL searched params:\n", GBR.best_params_)

In [None]:
# Refit gradient boosting with the best grid-search parameters and report
# R-squared, MAE and MSE on the validation split.
GBR_best = GradientBoostingRegressor(**GBR.best_params_)
GBR_best.fit(X_train, y_train)
y_pred = GBR_best.predict(X_val)
print('\n\nR-squared val set: ', r2_score(y_val, y_pred), sep='\n')
print('\nMAE val set: ', mean_absolute_error(y_val, y_pred), sep='\n')
print('\nMSE val set: ', mean_squared_error(y_val, y_pred), sep='\n')

In [None]:
# Candidate hyper-parameters for AdaBoost.
learning_rate = [0.01, 0.02, 0.05, 0.1]
loss = ['linear', 'square', 'exponential']
n_estimators = [25, 50, 100, 120]
param_grid = dict(learning_rate=learning_rate,
                  loss=loss,
                  n_estimators=n_estimators)

# Exhaustive search over the grid with 2-fold cross-validation.
ABR = GridSearchCV(estimator=AdaBoostRegressor(), param_grid=param_grid,
                   cv=2, n_jobs=-1)
ABR.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", ABR.best_estimator_)
print("\n The best score across ALL searched params:\n", ABR.best_score_)
print("\n The best parameters across ALL searched params:\n", ABR.best_params_)

In [None]:
# Refit AdaBoost with the best grid-search parameters and report
# R-squared, MAE and MSE on the validation split.
ABR_best = AdaBoostRegressor(**ABR.best_params_)
ABR_best.fit(X_train, y_train)
y_pred = ABR_best.predict(X_val)
print('\n\nR-squared val set: ', r2_score(y_val, y_pred), sep='\n')
print('\nMAE val set: ', mean_absolute_error(y_val, y_pred), sep='\n')
print('\nMSE val set: ', mean_squared_error(y_val, y_pred), sep='\n')

In [None]:
# Candidate hyper-parameters for the random forest.
n_estimators = [50, 100, 150, 300]
max_depth = [None, 4, 6, 8, 10]
# max_features takes 'sqrt', 'log2', None, an int or a float *value*.
# The type objects `int` and `float` are not valid settings, so concrete
# fractions of the feature count are searched instead.
max_features = ['sqrt', 'log2', None, 0.3, 0.5]
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'max_features': max_features}

# Exhaustive search over the grid with 2-fold cross-validation.
RFR = RandomForestRegressor()
RFR = GridSearchCV(estimator=RFR, param_grid=param_grid, cv=2, n_jobs=-1)
RFR.fit(X_train, y_train)
print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", RFR.best_estimator_)
print("\n The best score across ALL searched params:\n", RFR.best_score_)
print("\n The best parameters across ALL searched params:\n", RFR.best_params_)

In [None]:
# Refit the random forest with the best grid-search parameters and report
# R-squared, MAE and MSE on the validation split.
RFR_best = RandomForestRegressor(**RFR.best_params_)
RFR_best.fit(X_train, y_train)
y_pred = RFR_best.predict(X_val)
print('\n\nR-squared val set: ', r2_score(y_val, y_pred), sep='\n')
print('\nMAE val set: ', mean_absolute_error(y_val, y_pred), sep='\n')
print('\nMSE val set: ', mean_squared_error(y_val, y_pred), sep='\n')

In [None]:
# Candidate hyper-parameters for k-nearest-neighbours regression.
n_neighbors = [3, 5, 7, 10, 15]
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size = [20, 30, 50, 70]
p = [1, 2, 3]
param_grid = dict(n_neighbors=n_neighbors,
                  weights=weights,
                  algorithm=algorithm,
                  leaf_size=leaf_size,
                  p=p)

# Exhaustive search over the grid with 2-fold cross-validation.
KNR = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=param_grid,
                   cv=2, n_jobs=-1)
KNR.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", KNR.best_estimator_)
print("\n The best score across ALL searched params:\n", KNR.best_score_)
print("\n The best parameters across ALL searched params:\n", KNR.best_params_)

In [None]:
# Refit k-nearest-neighbours with the best grid-search parameters and report
# R-squared, MAE and MSE on the validation split.
KNR_best = KNeighborsRegressor(**KNR.best_params_)
KNR_best.fit(X_train, y_train)
y_pred = KNR_best.predict(X_val)
print('\n\nR-squared val set: ', r2_score(y_val, y_pred), sep='\n')
print('\nMAE val set: ', mean_absolute_error(y_val, y_pred), sep='\n')
print('\nMSE val set: ', mean_squared_error(y_val, y_pred), sep='\n')

In [None]:
# Candidate hyper-parameters for bagging.
n_estimators = [5, 10, 20, 25]
max_features = [0.1, 0.2, 0.3, 0.5, 1.0]
max_samples = [0.1, 0.2, 0.3, 0.5, 1.0]
param_grid = dict(n_estimators=n_estimators,
                  max_features=max_features,
                  max_samples=max_samples)

# Exhaustive search over the grid with 2-fold cross-validation.
BR = GridSearchCV(estimator=BaggingRegressor(), param_grid=param_grid,
                  cv=2, n_jobs=-1)
BR.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", BR.best_estimator_)
print("\n The best score across ALL searched params:\n", BR.best_score_)
print("\n The best parameters across ALL searched params:\n", BR.best_params_)

In [None]:
# Refit bagging with the best grid-search parameters and report
# R-squared, MAE and MSE on the validation split.
BR_best = BaggingRegressor(**BR.best_params_)
BR_best.fit(X_train, y_train)
y_pred = BR_best.predict(X_val)
print('\n\nR-squared val set: ', r2_score(y_val, y_pred), sep='\n')
print('\nMAE val set: ', mean_absolute_error(y_val, y_pred), sep='\n')
print('\nMSE val set: ', mean_squared_error(y_val, y_pred), sep='\n')

In [None]:
# Candidate hyper-parameters for a single decision tree.
criterion = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
splitter = ['best', 'random']
max_depth = [None, 4, 6, 8, 10]
param_grid = dict(criterion=criterion,
                  splitter=splitter,
                  max_depth=max_depth)

# Exhaustive search over the grid with 2-fold cross-validation.
DTR = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid,
                   cv=2, n_jobs=-1)
DTR.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", DTR.best_estimator_)
print("\n The best score across ALL searched params:\n", DTR.best_score_)
print("\n The best parameters across ALL searched params:\n", DTR.best_params_)

In [None]:
# Refit the decision tree with the best grid-search parameters and report
# R-squared, MAE and MSE on the validation split.
DTR_best = DecisionTreeRegressor(**DTR.best_params_)
DTR_best.fit(X_train, y_train)
y_pred = DTR_best.predict(X_val)
print('\n\nR-squared val set: ', r2_score(y_val, y_pred), sep='\n')
print('\nMAE val set: ', mean_absolute_error(y_val, y_pred), sep='\n')
print('\nMSE val set: ', mean_squared_error(y_val, y_pred), sep='\n')

In [None]:
# Candidate hyper-parameters for XGBoost.
learning_rate = [0.01, 0.03, 0.05, 0.07]
max_depth = [5, 6, 7]
subsample = [0.1, 0.2, 0.5, 0.7]
param_grid = dict(learning_rate=learning_rate,
                  max_depth=max_depth,
                  subsample=subsample)

# Exhaustive search over the grid with 2-fold cross-validation.
XGB = GridSearchCV(estimator=XGBRegressor(), param_grid=param_grid,
                   cv=2, n_jobs=-1)
XGB.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", XGB.best_estimator_)
print("\n The best score across ALL searched params:\n", XGB.best_score_)
print("\n The best parameters across ALL searched params:\n", XGB.best_params_)

In [None]:
# Refit XGBoost with the best grid-search parameters and report R-squared,
# MAE and MSE on the validation split.
XGB_best = XGBRegressor(**XGB.best_params_) # train with best parameter
XGB_best.fit(X_train, y_train)
y_pred = XGB_best.predict(X_val)
print('\n\nR-squared val set: ')
# Score the XGBoost model itself, not the decision tree fitted earlier.
print(XGB_best.score(X_val, y_val))
print('\nMAE val set: ')
print(mean_absolute_error(y_val, y_pred))
print('\nMSE val set: ')
print(mean_squared_error(y_val, y_pred))

> **Conclusion:** top 3 models with the highest efficiency based on score of test set.
> > Top 1: GradientBoostingRegressor

> > Top 2: RandomForestRegressor

> > Top 3: BaggingRegressor

---
**7. RETRAIN AND PREDICTION**

In [None]:
# The final model is trained on every available training row (no hold-out).
X = X_train = source
y = y_train = target
X_train.shape

In [None]:
# Candidate hyper-parameters for the final gradient-boosting model.
learning_rate = [0.01, 0.02, 0.05, 0.1]
subsample = [0.5, 0.2, 0.1]
n_estimators = [100, 500, 1000, 1500]
max_depth = [None, 3, 6, 9, 12]
param_grid = dict(learning_rate=learning_rate,
                  subsample=subsample,
                  n_estimators=n_estimators,
                  max_depth=max_depth)

# Exhaustive search over the grid with 2-fold cross-validation.
GBR = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=param_grid,
                   cv=2, n_jobs=-1)
GBR.fit(X_train, y_train)

print("Results from Grid Search")
print("\n The best estimator across ALL searched params:\n", GBR.best_estimator_)
print("\n The best score across ALL searched params:\n", GBR.best_score_)
print("\n The best parameters across ALL searched params:\n", GBR.best_params_)

In [None]:
# Build a fresh gradient-boosting model from the best grid-search parameters
# and fit it on X_train / y_train.
GBR_best = GradientBoostingRegressor(**GBR.best_params_) # train with best parameter
GBR_best.fit(X_train, y_train) 

In [None]:
# Predict the test set with the tuned gradient-boosting model and pair each
# prediction with its Id.
y_pred = GBR_best.predict(test_set)
output = pd.DataFrame(dict(Id=ids, SalePrice=y_pred.squeeze()))
output.head()

In [None]:
# Write the gradient-boosting predictions to csv without the index column.
output.to_csv('submission_GBR.csv', index=False)

In [None]:
# Predict the test set with the linear-regression model.
y_pred_LR = LR.predict(test_set)
# Use the linear-regression predictions here; `y_pred` holds the
# gradient-boosting predictions.
output_LR = pd.DataFrame({'Id': ids, 'SalePrice': y_pred_LR.squeeze()})
output_LR.head()

In [None]:
# Write the linear-regression predictions to csv without the index column.
output_LR.to_csv('submission_LR.csv', index=False)