# House Price Prediction

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import date
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## Importing the dataset

In [2]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')
test_id = test_dataset['Id']
dataset = pd.concat([train_dataset,test_dataset])
dataset.drop('Id', axis=1, inplace=True)

The ID is unique for each entry and is of no use in the modelling, this is removed in the previous step.

In [3]:
dataset.drop([columns for columns, count in dict(dataset.isna().sum()).items() if float(count/dataset.shape[0]) >= 0.5], axis = 1, inplace=True)

In [4]:
dataset['Utilities'].value_counts() 

AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64

In [5]:
dataset.drop('Utilities', axis=1, inplace=True)

## Taking care of missing data

In [6]:
for field in ['GarageType', 'GarageFinish','GarageQual', 'GarageCond',
              'BsmtFinType1','BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
              'BsmtFinType2','FireplaceQu',
               'MasVnrType' ] :
    dataset[field].fillna('None',inplace=True)

In [7]:
for field in ['MasVnrArea','BsmtFullBath','BsmtHalfBath'
              ,'BsmtFinSF1','GarageCars','GarageArea','TotalBsmtSF',
             'BsmtUnfSF','BsmtFinSF2','GarageYrBlt','TotalBsmtSF']:
    dataset[field].fillna(0,inplace=True) 

In [8]:
dataset['LotFrontage'] = dataset.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.mean()))
for feature in ['MSZoning', 'Electrical']:
    dataset[feature] = dataset.groupby('Neighborhood')[feature].transform(lambda x: x.fillna(x.mode()[0]))

for field in ['SaleType','Exterior1st','Exterior2nd',]:
    dataset[field].fillna(dataset[field].mode()[0],inplace=True)
    
dataset.Functional.fillna('Typ',inplace=True)
dataset.KitchenQual.fillna('TA',inplace=True)

## Encoding categorical data

### Encoding the Independent Variables

In [9]:
for field in ['MSSubClass','LandSlope','YearBuilt','YearRemodAdd','CentralAir','GarageYrBlt','PavedDrive','YrSold']:
    le = LabelEncoder()
    dataset[field] = le.fit_transform(dataset[field].values)

In [10]:
ordinal_features = ['ExterQual','LotShape','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1', 'BsmtFinType2',
              'HeatingQC','Functional','FireplaceQu','KitchenQual', 'GarageFinish','GarageQual','GarageCond']

for field in (ordinal_features):
    dataset[field] = dataset[field].astype(str)

    orders=[
    #ExterQual
    ['Po','Fa','TA','Gd','Ex'],
    #LotShape
    ['Reg','IR1' ,'IR2','IR3'],
    #BsmtQual
    ['None','Fa','TA','Gd','Ex'],
    #BsmtCond
    ['None','Po','Fa','TA','Gd','Ex'],
    #BsmtExposure
    ['None','No','Mn','Av','Gd'],
    #BsmtFinType1
    ['None','Unf','LwQ', 'Rec','BLQ','ALQ' , 'GLQ' ],
    #BsmtFinType2
   ['None','Unf','LwQ', 'Rec','BLQ','ALQ' , 'GLQ' ],
    #HeatingQC
    ['Po','Fa','TA','Gd','Ex'],
    #Functional
   ['Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
    #FireplaceQu
    ['None','Po','Fa','TA','Gd','Ex'],
    #KitchenQual
    ['Fa','TA','Gd','Ex'],
    #GarageFinish
    ['None','Unf','RFn','Fin'],
    #GarageQual
    ['None','Po','Fa','TA','Gd','Ex'],
    #GarageCond
    ['None','Po','Fa','TA','Gd','Ex']]

#for i in range(len(orders)):
#    dataset.loc[ordinal_features[i]] = OrdinalEncoder(categories = {0:orders[i]}).fit_transform(dataset.loc[ordinal_features[i]].values.reshape(-1,1))
for i in range(len(orders)):
  dataset[ordinal_features[i]] = OrdinalEncoder(categories = {0:orders[i]}).fit_transform(\
                                                                                  dataset[ordinal_features[i]].values.reshape(-1,1))

In [11]:
dataset = pd.get_dummies(dataset)

## Splitting the dataset into the Training set and Test set

In [12]:
train_dataset = dataset[:train_dataset.shape[0]]
test_dataset = dataset[train_dataset.shape[0]:].drop('SalePrice', axis=1)

In [13]:
train_dataset = train_dataset.drop(np.where(train_dataset['GrLivArea']>4000)[0])

get_dummies is used instead of OneHotEncoder to preserve the X as dataframe

In [14]:
X = train_dataset.drop('SalePrice', axis=1).values
y = np.log1p(train_dataset['SalePrice'])

In [15]:
feature_list = list(train_dataset.columns)
feature_list.remove('SalePrice')

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Training the Multiple Linear Regression model

In [17]:
X_train

array([[5.0000e+00, 6.5000e+01, 1.2257e+04, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [1.3000e+01, 2.4000e+01, 1.9500e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [0.0000e+00, 7.2000e+01, 1.0011e+04, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [4.0000e+00, 5.0000e+01, 6.0000e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [1.3000e+01, 2.1000e+01, 1.6800e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [1.0000e+00, 5.0000e+01, 6.0000e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00]])

In [18]:
regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)

LinearRegression()

In [19]:
y_pred1 = regressor1.predict(X_test)

In [20]:
sq_error1 = (y_test-y_pred1)**2

In [21]:
msq_error1 = sq_error1.mean()
msq_error1

77.93563207679517

# Training the Polynomial Regression models

In [22]:
poly_reg2 = PolynomialFeatures(degree = 2)
X_poly2 = poly_reg2.fit_transform(X_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly2, y_train)

LinearRegression()

In [23]:
y_pred2 = lin_reg_2.predict(poly_reg2.fit_transform(X_test))

In [24]:
sq_error2 = (y_test-y_pred2)**2

In [25]:
msq_error2 = sq_error2.mean()
msq_error2

1.2533378982270478

# Training the Decision Tree Regression Model

In [26]:
regressor5 = DecisionTreeRegressor(random_state = 0)
regressor5.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [27]:
y_pred5 = regressor5.predict(X_test)

In [28]:
sq_error5 = (y_test-y_pred5)**2

In [29]:
msq_error5 = sq_error5.mean()
msq_error5

0.04794947554760251

# Training the Random Forest Regression model

In [30]:
regressor6 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor6.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [31]:
y_pred6 = regressor6.predict(X_test)

In [32]:
sq_error6 = (y_test-y_pred6)**2

In [33]:
msq_error6 = sq_error6.mean()
msq_error6

0.01827567201737378

In [34]:
importances = list(regressor6.feature_importances_)
feature_importances = [(feature, round(importance, 6)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: OverallQual          Importance: 0.535964
Variable: GrLivArea            Importance: 0.12457
Variable: TotalBsmtSF          Importance: 0.043346
Variable: GarageCars           Importance: 0.03467
Variable: GarageArea           Importance: 0.027628
Variable: YearBuilt            Importance: 0.021172
Variable: 1stFlrSF             Importance: 0.020405
Variable: BsmtFinSF1           Importance: 0.018113
Variable: LotArea              Importance: 0.014119
Variable: YearRemodAdd         Importance: 0.01014
Variable: OverallCond          Importance: 0.009744
Variable: LotFrontage          Importance: 0.009221
Variable: GarageYrBlt          Importance: 0.00849
Variable: FireplaceQu          Importance: 0.007921
Variable: CentralAir           Importance: 0.007635
Variable: BsmtFinType1         Importance: 0.005686
Variable: WoodDeckSF           Importance: 0.005465
Variable: BsmtUnfSF            Importance: 0.004815
Variable: OpenPorchSF          Importance: 0.004346
Variable: Garage

## Training Random Forest Regression model with fewer variables

In [35]:
feature_importances = regressor6.feature_importances_[regressor6.feature_importances_ > np.percentile(regressor6.feature_importances_, 80)]
important_features = [feature_list[np.where(regressor6.feature_importances_ == f_i)[0][0]] for f_i in feature_importances]

X_imp = train_dataset.loc[:, important_features].values
test_imp = test_dataset.loc[:, important_features].values
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_imp, y, test_size = 0.2, random_state = 1)
regressor7 = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor7.fit(X_train_, y_train_)

RandomForestRegressor(random_state=0)

In [36]:
y_pred7 = regressor7.predict(X_test_)

In [37]:
sq_error7 = (y_test-y_pred7)**2

In [38]:
msq_error7 = sq_error7.mean()
msq_error7

0.017809316535872324

## Training simple Linear Regression Models

In [39]:
regressor8 = LinearRegression()
regressor8.fit(X_train[:,6].reshape(-1,1), y_train)

LinearRegression()

In [40]:
y_pred8 = regressor8.predict(X_test[:,6].reshape(-1,1))

In [41]:
sq_error8 = (y_test-y_pred8)**2

In [42]:
msq_error8 = sq_error8.mean()
msq_error8

0.1738702249785379

In [43]:
regressor9 = LinearRegression()
regressor9.fit(X_train[:,0].reshape(-1,1), y_train)

LinearRegression()

In [44]:
y_pred9 = regressor9.predict(X_test[:,0].reshape(-1,1))

In [45]:
sq_error9 = (y_test-y_pred9)**2

In [46]:
msq_error9 = sq_error9.mean()
msq_error9

0.17247078729776535

In [47]:
regressor10 = LinearRegression()
regressor10.fit(X_train[:,17].reshape(-1,1), y_train)

LinearRegression()

In [48]:
y_pred10 = regressor10.predict(X_test[:,17].reshape(-1,1))

In [49]:
sq_error10 = (y_test-y_pred10)**2

In [50]:
msq_error10 = sq_error10.mean()
msq_error10

0.1727054273443944

In [54]:
rf = RandomForestRegressor()
sale_price = rf.fit(X_train_,y_train_)

test = test_imp
sale_price = rf.predict(test)

sale_price = np.exp(np.array(sale_price))

In [55]:
submission = pd.DataFrame(sale_price, columns = ['SalePrice'])
submission = pd.concat([test_id, submission], axis=1)

submission.head()

Unnamed: 0,Id,SalePrice
0,1461,123563.752583
1,1462,150328.386884
2,1463,182119.178453
3,1464,180862.226055
4,1465,196470.445429


In [56]:
submission.to_csv('Housing Prices (Test).csv', index=False)