# Limiting Features
Here I use the feature-importance results from Feature_Importances_Testing.ipynb to limit the number of input features and retrain the regressors.

## Imports

In [112]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer

#Regressors
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [88]:
# Hand-rolled stand-in for sklearn's PredictionErrorDisplay
def make_regression_plot(reg):
  """Plot predicted vs. actual targets for the train and test splits.

  Reads the notebook-level X_train, y_train, X_test and y_test and draws on
  the current matplotlib axes.

  Parameters
  ----------
  reg : fitted estimator
      Any object exposing ``predict``.
  """
  # (features, targets, marker format, opacity, legend label) per split
  splits = (
      (X_train, y_train, 'bo', 0.2, 'Train'),
      (X_test, y_test, 'ro', 0.8, 'Test'),
  )
  for features, targets, fmt, opacity, name in splits:
    plt.plot(reg.predict(features), targets, fmt, alpha=opacity, label=name)

  # Dashed y = x reference line spanning the range of the test targets
  lo, hi = y_test.min(), y_test.max()
  plt.plot([lo, hi], [lo, hi], 'k--')

  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.gca().set_aspect('equal')  # square aspect ratio
  plt.legend()
  plt.grid()

## Data Selection

In [89]:
# Load the training data from Kathryn's repository.
dfTrain = pd.read_csv('https://raw.githubusercontent.com/Kathryn-Hise/Real_Estate_ML_Project/main/train.csv')

# One-hot encode the categorical columns as float indicator columns.
df = pd.get_dummies(dfTrain, dtype=float)

# Remove these three columns by name (presumably because they contain missing
# values -- verify against the data). DataFrame.drop returns a new frame, so
# the result has to be assigned back for the removal to take effect.
df = df.drop(columns=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])

Edit this section to include or exclude features.

In [90]:
# Important features according to Random Forest Regression
# ---------------------------------------------------------
# Include or exclude a feature by removing or adding the leading '#'.
feats = [
    # 'OverallQual',       # hard to quantify
    'GrLivArea',           # good
    # 'TotalBsmtSF',       # square footage (unnecessary?)
    # 'BsmtFinSF1',        # square footage (unnecessary?)
    # '2ndFlrSF',          # square footage (unnecessary?)
    # 'GarageArea',        # using 'GarageCars' is better
    # 'YearRemodAdd',
    'YearBuilt',           # good
    'GarageCars',          # good
    # '1stFlrSF',          # square footage (unnecessary?)
    # 'LotShape_Reg',      # ??
    'LotArea',             # good
    # 'WoodDeckSF',
    # 'Condition1_Feedr',  # ??
    # 'SaleType_Oth',      # ??
    # 'Foundation_Wood',   # ??
    'FullBath',
    'BedroomAbvGr',
    # 'MoSold',
    'HalfBath',
    'SalePrice',           # target value
]

In [91]:
# Summary statistics for the FullBath column
df.loc[:, 'FullBath'].describe()

count    1460.000000
mean        1.565068
std         0.550916
min         0.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         3.000000
Name: FullBath, dtype: float64

In [92]:
# Summary statistics for the HalfBath column
df.loc[:, 'HalfBath'].describe()

count    1460.000000
mean        0.382877
std         0.502885
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         2.000000
Name: HalfBath, dtype: float64

In [93]:
# Keep only the selected columns, then separate the target from the predictors.
df_new = df[feats]
y = df_new['SalePrice'].to_numpy()
X = df_new.drop(columns='SalePrice').to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Linear Regression

In [94]:
# Baseline: standardize the features, then fit ordinary least squares.
estimators = [
    ('scaler', StandardScaler()),
    ('reg', LinearRegression()),
]
lin_pipe = Pipeline(estimators).fit(X_train, y_train)

# Score on both splits to compare training fit against held-out performance.
lin_train_score = lin_pipe.score(X_train, y_train)
lin_test_score = lin_pipe.score(X_test, y_test)
print(f"Train Score:\t{lin_train_score:0.3f}")
print(f"Test Score:\t{lin_test_score:0.3f}")
#make_regression_plot(lin_pipe)

Train Score:	0.740
Test Score:	0.618


In [113]:
# Same linear model, but with a PowerTransformer in the preprocessing step
# (the step keeps the name 'scaler').
estimators = [
    ('scaler', PowerTransformer()),
    ('reg', LinearRegression()),
]
lin_pipe1 = Pipeline(estimators).fit(X_train, y_train)

# Score on both splits to compare training fit against held-out performance.
lin1_train_score = lin_pipe1.score(X_train, y_train)
lin1_test_score = lin_pipe1.score(X_test, y_test)
print(f"Train Score:\t{lin1_train_score:0.3f}")
print(f"Test Score:\t{lin1_test_score:0.3f}")

Train Score:	0.721
Test Score:	0.649


### hide


In [95]:
# Read the fitted linear model from lin_pipe itself. The shared `estimators`
# list is rebound by every later pipeline cell, so indexing into it would show
# whichever regressor was built most recently instead of this one.
lin_reg = lin_pipe.named_steps['reg']

# One coefficient per input column of X (in column order), expressed in the
# units of the scaled features because the model sits behind the 'scaler' step.
print(lin_reg.coef_)
print(lin_reg.intercept_)

[ 55850.23327586  26248.51897928  14318.82575438   7423.30640122
  -5273.74298735 -12522.30314171  -6886.47596717]
180808.89897260274


In [96]:
# Sanity check: predict the price of the first row of X.
pre = X[0]
# predict() expects a 2-D array, so add a leading axis to the single sample.
print(X[0], 'price', lin_pipe.predict(pre[np.newaxis, :]))


[1710 2003    2 8450    2    3    1] price [217779.14995002]


In [97]:
# Plain linear regression on the raw, unscaled features, for comparison with
# the scaled pipelines.
lin = LinearRegression()
lin.fit(X_train, y_train)

# Score the model that was just fitted (`lin`), not the earlier `lin_pipe`.
print(f"Train Score:\t{lin.score(X_train, y_train):0.3f}")
print(f"Test Score:\t{lin.score(X_test,y_test):0.3f}")

Train Score:	0.740
Test Score:	0.618


## HistGradientBoostingRegressor

In [98]:
# Histogram-based gradient boosting on standardized features.
estimators = [
    ('scaler', StandardScaler()),
    ('reg', HistGradientBoostingRegressor()),
]
hist_pipe = Pipeline(estimators).fit(X_train, y_train)

# Score on both splits to compare training fit against held-out performance.
hist_train_score = hist_pipe.score(X_train, y_train)
hist_test_score = hist_pipe.score(X_test, y_test)
print(f"Train Score:\t{hist_train_score:0.3f}")
print(f"Test Score:\t{hist_test_score:0.3f}")
#make_regression_plot(hist_pipe)

Train Score:	0.929
Test Score:	0.793


In [114]:
# Same boosting model, but with a PowerTransformer in the preprocessing step
# (the step keeps the name 'scaler').
estimators = [
    ('scaler', PowerTransformer()),
    ('reg', HistGradientBoostingRegressor()),
]
hist_pipe1 = Pipeline(estimators).fit(X_train, y_train)

# Score on both splits to compare training fit against held-out performance.
hist1_train_score = hist_pipe1.score(X_train, y_train)
hist1_test_score = hist_pipe1.score(X_test, y_test)
print(f"Train Score:\t{hist1_train_score:0.3f}")
print(f"Test Score:\t{hist1_test_score:0.3f}")

Train Score:	0.929
Test Score:	0.793


## Random Forest Regression

In [99]:
# Random forest on standardized features.
# random_state pins the forest's bootstrap sampling and split selection so the
# scores below are reproducible on re-run (same seed as the train/test split).
estimators = [
    ('scaler', StandardScaler()),
    ('reg', RandomForestRegressor(random_state=0)),
]
rfr_pipe = Pipeline(estimators)

rfr_pipe.fit(X_train, y_train)

print(f"Train Score:\t{rfr_pipe.score(X_train, y_train):0.3f}")
print(f"Test Score:\t{rfr_pipe.score(X_test,y_test):0.3f}")
#make_regression_plot(rfr_pipe)

Train Score:	0.972
Test Score:	0.825


In [115]:
# Random forest with a PowerTransformer in the preprocessing step (the step
# keeps the name 'scaler').
# random_state pins the forest's bootstrap sampling and split selection so the
# scores below are reproducible on re-run (same seed as the train/test split).
estimators = [
    ('scaler', PowerTransformer()),
    ('reg', RandomForestRegressor(random_state=0)),
]
rfr_pipe1 = Pipeline(estimators)

rfr_pipe1.fit(X_train, y_train)

print(f"Train Score:\t{rfr_pipe1.score(X_train, y_train):0.3f}")
print(f"Test Score:\t{rfr_pipe1.score(X_test,y_test):0.3f}")

Train Score:	0.972
Test Score:	0.826


# Predictor