# Ames House Price Prediction Using Linear Regression

### Link to the Dataset: [Ames House Prices](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

### Importing Libraries

In [2]:
import pandas as pd
from sklearn import preprocessing  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

### Getting our Data

In [3]:
# Load the Kaggle training set; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Show the loaded DataFrame; the rendered output elides the middle rows and columns.
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


### Data Preprocessing

In [5]:
# LabelEncoder used to turn columns that are not int/float typed into integer codes.
label_encoder = preprocessing.LabelEncoder()

In [6]:
# Worked example: label-encode one categorical column (MSZoning), then show the frame.
zoning_codes = label_encoder.fit_transform(df['MSZoning'])
df['MSZoning'] = zoning_codes
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,3,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,3,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,3,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,3,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,3,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,3,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,3,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [7]:
# Pair each column's dtype with its name to spot the object-typed (non-numeric) columns.
l = [(col_dtype, col_name) for col_dtype, col_name in zip(df.dtypes, df.columns)]
l

[(dtype('int64'), 'Id'),
 (dtype('int64'), 'MSSubClass'),
 (dtype('int32'), 'MSZoning'),
 (dtype('float64'), 'LotFrontage'),
 (dtype('int64'), 'LotArea'),
 (dtype('O'), 'Street'),
 (dtype('O'), 'Alley'),
 (dtype('O'), 'LotShape'),
 (dtype('O'), 'LandContour'),
 (dtype('O'), 'Utilities'),
 (dtype('O'), 'LotConfig'),
 (dtype('O'), 'LandSlope'),
 (dtype('O'), 'Neighborhood'),
 (dtype('O'), 'Condition1'),
 (dtype('O'), 'Condition2'),
 (dtype('O'), 'BldgType'),
 (dtype('O'), 'HouseStyle'),
 (dtype('int64'), 'OverallQual'),
 (dtype('int64'), 'OverallCond'),
 (dtype('int64'), 'YearBuilt'),
 (dtype('int64'), 'YearRemodAdd'),
 (dtype('O'), 'RoofStyle'),
 (dtype('O'), 'RoofMatl'),
 (dtype('O'), 'Exterior1st'),
 (dtype('O'), 'Exterior2nd'),
 (dtype('O'), 'MasVnrType'),
 (dtype('float64'), 'MasVnrArea'),
 (dtype('O'), 'ExterQual'),
 (dtype('O'), 'ExterCond'),
 (dtype('O'), 'Foundation'),
 (dtype('O'), 'BsmtQual'),
 (dtype('O'), 'BsmtCond'),
 (dtype('O'), 'BsmtExposure'),
 (dtype('O'), 'BsmtFinType

In [8]:
# Cast every object-typed column to NumPy byte strings ('|S') ahead of label encoding.
# After the cast, values render as b'...' and missing entries become the literal b'nan'.
a = [
    'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]
for column_name in a:
    df[column_name] = df[column_name].astype('|S')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,b'Pave',b'nan',b'Reg',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,2,2008,b'WD',b'Normal',208500
1,2,20,3,80.0,9600,b'Pave',b'nan',b'Reg',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,5,2007,b'WD',b'Normal',181500
2,3,60,3,68.0,11250,b'Pave',b'nan',b'IR1',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,9,2008,b'WD',b'Normal',223500
3,4,70,3,60.0,9550,b'Pave',b'nan',b'IR1',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,2,2006,b'WD',b'Abnorml',140000
4,5,60,3,84.0,14260,b'Pave',b'nan',b'IR1',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,12,2008,b'WD',b'Normal',250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,b'Pave',b'nan',b'Reg',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,8,2007,b'WD',b'Normal',175000
1456,1457,20,3,85.0,13175,b'Pave',b'nan',b'Reg',b'Lvl',b'AllPub',...,0,b'nan',b'MnPrv',b'nan',0,2,2010,b'WD',b'Normal',210000
1457,1458,70,3,66.0,9042,b'Pave',b'nan',b'Reg',b'Lvl',b'AllPub',...,0,b'nan',b'GdPrv',b'Shed',2500,5,2010,b'WD',b'Normal',266500
1458,1459,20,3,68.0,9717,b'Pave',b'nan',b'Reg',b'Lvl',b'AllPub',...,0,b'nan',b'nan',b'nan',0,4,2010,b'WD',b'Normal',142125


In [9]:
# Label-encode every categorical column and keep one fitted encoder per column.
#
# A fresh LabelEncoder is created for each column so that label_object[column]
# holds the classes of *that* column and can later map its integer codes back to
# the original categories. Reusing a single shared encoder would make every
# entry of label_object reference the same object, fitted only on the last
# column processed.
label_object = {}
a = [
    'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition',
]
for i in a:
    labelencoder = preprocessing.LabelEncoder()  # independent encoder for this column
    # fit_transform learns the classes and encodes in one step; no separate fit() needed
    df[i] = labelencoder.fit_transform(df[i])
    label_object[i] = labelencoder
df  # final encoded dataframe

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,3,3,0,...,0,3,4,4,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,2,3,3,0,...,0,3,4,4,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,2,0,3,0,...,0,3,4,4,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,2,0,3,0,...,0,3,4,4,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,2,0,3,0,...,0,3,4,4,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,1,2,3,3,0,...,0,3,4,4,0,8,2007,8,4,175000
1456,1457,20,3,85.0,13175,1,2,3,3,0,...,0,3,2,4,0,2,2010,8,4,210000
1457,1458,70,3,66.0,9042,1,2,3,3,0,...,0,3,0,2,2500,5,2010,8,4,266500
1458,1459,20,3,68.0,9717,1,2,3,3,0,...,0,3,4,4,0,4,2010,8,4,142125


In [10]:
# For each column, record whether it still contains at least one missing value.
null_flags = df.isnull().any()
b = [(has_null, column) for column, has_null in null_flags.items()]
b

[(False, 'Id'),
 (False, 'MSSubClass'),
 (False, 'MSZoning'),
 (True, 'LotFrontage'),
 (False, 'LotArea'),
 (False, 'Street'),
 (False, 'Alley'),
 (False, 'LotShape'),
 (False, 'LandContour'),
 (False, 'Utilities'),
 (False, 'LotConfig'),
 (False, 'LandSlope'),
 (False, 'Neighborhood'),
 (False, 'Condition1'),
 (False, 'Condition2'),
 (False, 'BldgType'),
 (False, 'HouseStyle'),
 (False, 'OverallQual'),
 (False, 'OverallCond'),
 (False, 'YearBuilt'),
 (False, 'YearRemodAdd'),
 (False, 'RoofStyle'),
 (False, 'RoofMatl'),
 (False, 'Exterior1st'),
 (False, 'Exterior2nd'),
 (False, 'MasVnrType'),
 (True, 'MasVnrArea'),
 (False, 'ExterQual'),
 (False, 'ExterCond'),
 (False, 'Foundation'),
 (False, 'BsmtQual'),
 (False, 'BsmtCond'),
 (False, 'BsmtExposure'),
 (False, 'BsmtFinType1'),
 (False, 'BsmtFinSF1'),
 (False, 'BsmtFinType2'),
 (False, 'BsmtFinSF2'),
 (False, 'BsmtUnfSF'),
 (False, 'TotalBsmtSF'),
 (False, 'Heating'),
 (False, 'HeatingQC'),
 (False, 'CentralAir'),
 (False, 'Electrical'

In [11]:
# Impute every remaining missing value with 0, then show the result.
df = df.fillna(value=0)
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,3,3,0,...,0,3,4,4,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,2,3,3,0,...,0,3,4,4,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,2,0,3,0,...,0,3,4,4,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,2,0,3,0,...,0,3,4,4,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,2,0,3,0,...,0,3,4,4,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,1,2,3,3,0,...,0,3,4,4,0,8,2007,8,4,175000
1456,1457,20,3,85.0,13175,1,2,3,3,0,...,0,3,2,4,0,2,2010,8,4,210000
1457,1458,70,3,66.0,9042,1,2,3,3,0,...,0,3,0,2,2500,5,2010,8,4,266500
1458,1459,20,3,68.0,9717,1,2,3,3,0,...,0,3,4,4,0,4,2010,8,4,142125


In [12]:
# Sanity check: after imputation every column should report False (no missing values).
df.isna().any()

Id               False
MSSubClass       False
MSZoning         False
LotFrontage      False
LotArea          False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 81, dtype: bool

### Splitting Data for Training and Testing

In [13]:
# Hold out 30% of the rows for testing (70:30 split) with a fixed seed.
# The last column (SalePrice) is the target; all preceding columns are features.
# NOTE(review): the feature matrix includes the 'Id' column — confirm that is intended.
data = df.values
X = data[:, :-1]
y = data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=10,
)

### Linear Regression Model 

In [14]:
# Create the linear regression estimator with its default settings (not yet fitted).
model = LinearRegression()

In [15]:
# Fit the regressor on the training split.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [16]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [17]:
# Evaluate goodness of fit on the test split with R² (coefficient of determination).
r2_score(y_true=y_test, y_pred=y_pred)

0.833070538824606

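# The model reaches an R² score of about 0.833 on the test set, i.e. it explains roughly 83.3% of the variance in sale price (R² measures goodness of fit, not classification accuracy).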