# Ames Housing Prices Prediction

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad.  
But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this [competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) challenges you to predict the final price of each home.

In this notebook, I plan to show how to clean, explore, analyze and predict home prices based on permanent house qualities. I used only Linearrgression using total home sqrft ratio with overall qaulity of the house 

This notebook uses [data from Kaggle](https://www.kaggle.com/c/dsi-us-4-project-2-regression-challenge/data).

This is the final submitted [submitted  to Kaggle](https://www.kaggle.com/c/dsi-us-4-project-2-regression-challenge/leaderboard). 

##  Library and data Setup 

In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mpl_toolkits

warnings.filterwarnings('ignore')
import scipy.stats as stats
from scipy.stats import skew,norm
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import copy
# Configure visual settings:
%matplotlib inline
sns.set_style('whitegrid')
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.style.use('ggplot')
np.random.seed(2018)
%load_ext autoreload
%autoreload 2
sns.set()

# Data

In [2]:
train = pd.read_csv('./data/train.csv')

In [3]:
test = pd.read_csv('./data/test.csv')

In [4]:
train_obj = train.select_dtypes(exclude=[np.number])
train_num = train.select_dtypes(include=[np.number])
test_obj = test.select_dtypes(exclude=[np.number])
test_num = test.select_dtypes(include=[np.number])

In [5]:
# fill NaNs in numeric columns with mean of column (Train)
train.fillna(train_num.mean(), inplace=True)

In [6]:
# fill NaNs in numeric columns with mean of column (Test)
test.fillna(test_num.mean(), inplace=True)

In [7]:
# Get dummies on categorical columns (Train)
train = pd.concat([train, pd.get_dummies(train_obj)], axis=1)

In [8]:
# Get dummies on categorical columns (Test)
test = pd.concat([test, pd.get_dummies(test_obj)], axis=1)

In [9]:
#Confirm dummies were added
print(train.shape)
print(test.shape)

(2051, 334)
(879, 315)


In [10]:
# Identify Object Columns in Train

train_obj.columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType'],
      dtype='object')

In [11]:
# Drop those "object" columns in Train
train.drop(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType'], axis=1, inplace=True)

In [12]:
# Identify Object Columns in Test

test_obj.columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType'],
      dtype='object')

In [13]:
# Drop those "object" columns in Test
test.drop(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType'], axis=1, inplace=True)

In [14]:
# Recheck shape of dataset to confirm object columns are dropped
print(train.shape)
print(test.shape)

(2051, 292)
(879, 273)


In [15]:
# Recheck that there are no more null values
train.isnull().sum().sum()

0

In [16]:
extra_train_cols = set(train.columns) - set(test.columns)
extra_train_cols = list(extra_train_cols)
extra_train_cols

['Functional_Sal',
 'RoofMatl_Membran',
 'Condition2_RRAe',
 'Heating_OthW',
 'Neighborhood_Landmrk',
 'Exterior1st_CBlock',
 'MiscFeature_TenC',
 'Utilities_NoSeWa',
 'Neighborhood_GrnHill',
 'Exterior2nd_Stone',
 'HeatingQC_Po',
 'Condition2_Artery',
 'Electrical_Mix',
 'BsmtCond_Ex',
 'Heating_Wall',
 'Functional_Sev',
 'MiscFeature_Elev',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'GarageQual_Ex',
 'SalePrice',
 'MSZoning_A (agr)',
 'PoolQC_Fa',
 'Condition2_RRNn',
 'Condition2_RRAn',
 'RoofMatl_ClyTile',
 'PoolQC_Gd',
 'BsmtCond_Po']

In [17]:
# Drop those Extra Columns in Train

train.drop(['PoolQC_Gd',
 'Condition2_RRNn',
 'MiscFeature_Elev',
 'Exterior1st_Stone',
 'MSZoning_A (agr)',
 'Utilities_NoSeWa',
 'Condition2_RRAe',
 'Neighborhood_GrnHill',
 'Exterior2nd_Stone',
 'BsmtCond_Ex',
 'Condition2_Artery',
 'GarageQual_Ex',
 'Functional_Sev',
 'PoolQC_Fa',
 'Functional_Sal',
 'BsmtCond_Po',
 'Exterior1st_ImStucc',
 'MiscFeature_TenC',
 'Exterior1st_CBlock',
 'RoofMatl_Membran',
 'Heating_OthW',
 'Condition2_RRAn',
 'Heating_Wall',
 'Electrical_Mix',
 'HeatingQC_Po',
 'RoofMatl_ClyTile',
 'Neighborhood_Landmrk'], axis=1, inplace=True)

In [18]:
extra_test_cols = set(test.columns) - set(train.columns)
extra_test_cols = list(extra_test_cols)
extra_test_cols

['SaleType_VWD',
 'KitchenQual_Po',
 'Heating_Floor',
 'Exterior2nd_PreCast',
 'Exterior2nd_Other',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'MasVnrType_CBlock',
 'Exterior1st_PreCast']

In [19]:
# Drop those Extra columns in Test

test.drop(['SaleType_VWD',
 'KitchenQual_Po',
 'Exterior2nd_PreCast',
 'Exterior1st_PreCast',
 'MasVnrType_CBlock',
 'Heating_Floor',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'Exterior2nd_Other'], axis=1, inplace=True)

In [20]:
# Check Train Shape

train.shape

(2051, 265)

In [21]:
# Check Test Shape

test.shape

(879, 264)

In [22]:
# Fill New column SalePrice in Test with 0s

test['SalePrice'] = 0

In [23]:
# Recheck Test Shape

test.shape

(879, 265)

In [24]:
# Feature Selection with 10 Features

from sklearn.feature_selection import  SelectKBest, f_regression, f_classif
selector = SelectKBest(score_func=f_regression, k=10)

target = 'SalePrice'
not_target = [x for x in train.columns if x != target]

predictors = not_target

selector.fit(train[predictors], train[target])

best_features = selector.get_support(indices=True)

features = list(train[predictors].columns[selector.get_support(indices = True)])
features

['OverallQual',
 'YearBuilt',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'ExterQual_TA',
 'BsmtQual_Ex',
 'KitchenQual_Ex']

In [25]:
# Train-Train-Split on Train Data Set

from sklearn.model_selection import train_test_split, KFold, cross_val_score

target = ['SalePrice']
not_target = ['OverallQual',
 'YearBuilt',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'ExterQual_TA',
 'BsmtQual_Ex',
 'KitchenQual_Ex']

X = train[not_target].values
Y = train[target].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [26]:
# Standardize Data Set

ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [27]:
# Fit Linear Regression Model

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_scaled, Y_train)
print(f"LinReg Training Score: {lr.score(X_test_scaled, Y_test)}")

LinReg Training Score: 0.8685090741271411


In [28]:
# Now Use the Model on the Test Data Set

target = ['SalePrice']
not_target = ['OverallQual',
 'YearBuilt',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'ExterQual_TA',
 'BsmtQual_Ex',
 'KitchenQual_Ex']

X_testvalues = test[not_target].values
Y_testvalues = test[target].values


ss = StandardScaler()
X_testvalues_scaled = ss.fit_transform(X_testvalues)
X_testvalues_scaled = ss.transform(X_testvalues)

yhat_withKbest = lr.predict(X_testvalues_scaled)

In [29]:
# Feature Selection with 1 Features

from sklearn.feature_selection import  SelectKBest, f_regression, f_classif
selector = SelectKBest(score_func=f_regression, k=1)

target = 'SalePrice'
not_target = [x for x in train.columns if x != target]

predictors = not_target

selector.fit(train[predictors], train[target])

best_features = selector.get_support(indices=True)

features = list(train[predictors].columns[selector.get_support(indices = True)])
features

['OverallQual']

In [30]:
# Fit Linear Model on 1 Feature

target = ['SalePrice']
not_target = ['Overall Qual']

X_testvalues = test[not_target].values
Y_testvalues = test[target].values


X = train[not_target].values
Y = train[target].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_scaled, Y_train)
print(f"LinReg Training Score: {lr.score(X_test_scaled, Y_test)}")

KeyError: "['Overall Qual'] not in index"

In [None]:
# Feature Selection with 25 Features

from sklearn.feature_selection import  SelectKBest, f_regression, f_classif
selector = SelectKBest(score_func=f_regression, k=25)

target = 'SalePrice'
not_target = [x for x in train.columns if x != target]

predictors = not_target

selector.fit(train[predictors], train[target])

best_features = selector.get_support(indices=True)

features = list(train[predictors].columns[selector.get_support(indices = True)])
features

In [None]:
# Fit Linear Model on 25 Features

target = ['SalePrice']
not_target = ['Overall Qual',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Full Bath',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Yr Blt',
 'Garage Cars',
 'Garage Area',
 'Neighborhood_NridgHt',
 'Exter Qual_Ex',
 'Exter Qual_Gd',
 'Exter Qual_TA',
 'Foundation_PConc',
 'Bsmt Qual_Ex',
 'Bsmt Qual_TA',
 'BsmtFin Type 1_GLQ',
 'Heating QC_Ex',
 'Kitchen Qual_Ex',
 'Kitchen Qual_TA',
 'Garage Finish_Unf']

X_testvalues = test[not_target].values
Y_testvalues = test[target].values


X = train[not_target].values
Y = train[target].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_scaled, Y_train)
print(f"LinReg Training Score: {lr.score(X_test_scaled, Y_test)}")

In [None]:
Submission for LinReg Model

submission_yhat_withKbest = pd.DataFrame(data = yhat_withKbest, columns = ['SalePrice'], index=test['Id'])
submission_yhat_withKbest.to_csv('./data/submission_yhat_withKbest.csv')

In [None]:
# Ridge Model

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

ridge_alphas = np.logspace(0, 5, 200)

optimal_ridge = RidgeCV(alphas=ridge_alphas, cv=10)
optimal_ridge.fit(X_train_scaled, Y_train)

print(optimal_ridge.alpha_)

ridge = Ridge(alpha=optimal_ridge.alpha_)

ridge_scores = cross_val_score(ridge, X_train_scaled, Y_train, cv=10)

print(ridge_scores)
print(np.mean(ridge_scores))
yhat_optimal_ridge = optimal_ridge.predict(X_testvalues_scaled)

In [None]:
def plot_cv(alphas, cv_means, optimal_alpha, lr_mse, log=False):
    # alphas = list of alphas
    # cv_means = list of CV mean MSE
    # optimal_alpha
    # lr_mse
    fig = plt.figure(figsize=(12,8))
    ax = plt.gca()

    if log:
        ax.semilogx(alphas, cv_means, lw=2)
    else:
        ax.plot(alphas, cv_means, lw=2)
    ax.axvline(optimal_alpha)
    ax.axhline(lr_mse)
    ax.set_xlabel('alpha')
    ax.set_ylabel('Mean Squared Error')

lr_model = LinearRegression()
lr_cv_mean_mse = -cross_val_score(lr_model, X_train_scaled, Y_train, cv=5, scoring='neg_mean_squared_error').mean()
    
plot_cv(optimal_ridge.alphas, ridge_scores, optimal_ridge.alpha_, lr_cv_mean_mse, log=True)

In [None]:
#Submission for Ridge Model

submission_optimal_ridge = pd.DataFrame(data = yhat_optimal_ridge, columns = ['SalePrice'], index=test['Id'])
submission_optimal_ridge.to_csv('./data/submission_optimal_ridge.csv')

In [None]:
# Lasso Model

from sklearn.linear_model import Lasso, LassoCV

optimal_lasso = LassoCV(n_alphas=500, cv=10, verbose=0)
optimal_lasso.fit(X_train_scaled, Y_train)

print(optimal_lasso.alpha_)

lasso = Lasso(alpha=optimal_lasso.alpha_)

lasso_scores = cross_val_score(lasso, X_train_scaled, Y_train, cv=10)

print(lasso_scores)
print(np.mean(lasso_scores))

lasso.fit(X_train_scaled, Y_train)

yhat_optimal_lasso = optimal_lasso.predict(X_testvalues_scaled)

In [None]:
# Submission for Lasso Model

submission_optimal_lasso = pd.DataFrame(data = yhat_optimal_lasso, columns = ['SalePrice'], index=test['Id'])
submission_optimal_lasso.to_csv('./data/submission_optimal_lasso.csv')

In [None]:
# Elastic Net

from sklearn.linear_model import ElasticNet, ElasticNetCV

l1_ratios = np.linspace(0.01, 1.0, 25)

optimal_enet = ElasticNetCV(l1_ratio=l1_ratios, n_alphas=100, cv=10,
                            verbose=0)
optimal_enet.fit(X_train_scaled, Y_train)

print(optimal_enet.alpha_)
print(optimal_enet.l1_ratio_)

enet = ElasticNet(alpha=optimal_enet.alpha_, l1_ratio=optimal_enet.l1_ratio_)

enet_scores = cross_val_score(enet, X_train_scaled, Y_train, cv=10)

print(enet_scores)
print(np.mean(enet_scores))

yhat_optimal_enet = optimal_enet.predict(X_testvalues_scaled)

In [None]:
#Submission for Elastic Net

submission_optimal_enet = pd.DataFrame(data = yhat_optimal_enet, columns = ['SalePrice'], index=test['Id'])
submission_optimal_enet.to_csv('./data/submission_optimal_enet.csv')